diff --git a/.bazelrc b/.bazelrc
index d4d7ad61867..27172e929b0 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -39,32 +39,46 @@ build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=0
 
 build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
 build:download_clang --define=using_clang=true
+build:download_clang --action_env TF_DOWNLOAD_CLANG=1
 
 # Instruct clang to use LLD for linking.
 # This only works with GPU builds currently, since Bazel sets -B/usr/bin in
 # auto-generated CPU crosstool, forcing /usr/bin/ld.lld to be preferred over
 # the downloaded one.
 build:download_clang_use_lld --linkopt='-fuse-ld=lld'
 
-build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
-build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
+# This config refers to building with CUDA available. It does not necessarily
+# mean that we build CUDA op kernels.
+build:using_cuda --define=using_cuda=true
+build:using_cuda --action_env TF_NEED_CUDA=1
+build:using_cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
+
+# This config refers to building CUDA op kernels with nvcc.
+build:cuda --config=using_cuda
+build:cuda --define=using_cuda_nvcc=true
+
+# This config refers to building CUDA op kernels with clang.
+build:cuda_clang --config=using_cuda
+build:cuda_clang --define=using_cuda_clang=true
+build:cuda_clang --define=using_clang=true
+
+build:tensorrt --action_env TF_NEED_TENSORRT=1
 
 build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain
 build:rocm --define=using_rocm=true --define=using_rocm_hipcc=true
-
-build:cuda_clang --crosstool_top=@local_config_cuda//crosstool:toolchain
-build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true --define=using_clang=true
+build:rocm --action_env TF_NEED_ROCM=1
 
 build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl --define=using_sycl=true --define=using_trisycl=false
+build:sycl --define=using_sycl=true
+build:sycl --action_env TF_NEED_OPENCL_SYCL=1
 
-build:sycl_nodouble --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_nodouble --define=using_sycl=true --cxxopt -DTENSORFLOW_SYCL_NO_DOUBLE
+build:sycl_nodouble --config=sycl
+build:sycl_nodouble --cxxopt -DTENSORFLOW_SYCL_NO_DOUBLE
 
-build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_asan --define=using_sycl=true --define=using_trisycl=false --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
+build:sycl_asan --config=sycl
+build:sycl_asan --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
 
-build:sycl_trisycl --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_trisycl --define=using_sycl=true --define=using_trisycl=true
+build:sycl_trisycl --config=sycl
+build:sycl_trisycl --define=using_trisycl=true
 
 # Options extracted from configure script
 build:gdr --define=with_gdr_support=true
@@ -87,6 +101,9 @@ build --spawn_strategy=standalone
 build --strategy=Genrule=standalone
 build -c opt
 
+# Make Bazel print out all options from rc files.
+build --announce_rc
+
 # Other build flags.
 build --define=grpc_no_ares=true
 
@@ -97,8 +114,7 @@ build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS
 # Build TF with C++ 17 features.
 build:c++17 --cxxopt=-std=c++1z
 build:c++17 --cxxopt=-stdlib=libc++
-build:c++1z --cxxopt=-std=c++1z
-build:c++1z --cxxopt=-stdlib=libc++
+build:c++1z --config=c++17
 
 # Default paths for TF_SYSTEM_LIBS
 build --define=PREFIX=/usr
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 73782143a3d..b460bdde24f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -38,7 +38,13 @@ working on getting your pull request submitted to our internal repository.
 After the change has been submitted internally, your pull request will be
 merged automatically on GitHub.
 
-If you want to contribute but you're not sure where to start, take a look at the
+If you want to contribute, start by working through the TensorFlow codebase,
+then navigate to the
+[GitHub "issues" tab](https://github.com/tensorflow/tensorflow/issues) and look
+through the open issues. If you are not sure where to start, try one of the
+smaller/easier issues, i.e. the
+[issues with the "good first issue" label](https://github.com/tensorflow/tensorflow/labels/good%20first%20issue),
+and then take a look at the
 [issues with the "contributions welcome" label](https://github.com/tensorflow/tensorflow/labels/stat%3Acontributions%20welcome).
 These are issues that we believe are particularly well suited for outside
 contributions, often because we probably won't get to them right now. If you
diff --git a/configure.py b/configure.py
index 2120a4b27d6..8d6772c199b 100644
--- a/configure.py
+++ b/configure.py
@@ -403,7 +403,8 @@ def set_action_env_var(environ_cp,
                        enabled_by_default,
                        question=None,
                        yes_reply=None,
-                       no_reply=None):
+                       no_reply=None,
+                       bazel_config_name=None):
   """Set boolean action_env variable.
 
   Ask user if query_item will be enabled. Default is used if no input is given.
@@ -418,12 +419,16 @@ def set_action_env_var(environ_cp,
     question: optional string for how to ask for user input.
     yes_reply: optional string for reply when feature is enabled.
     no_reply: optional string for reply when feature is disabled.
+    bazel_config_name: if set, write 'build --config=<name>' to .bazelrc instead of an action_env.
   """
 
   var = int(
       get_var(environ_cp, var_name, query_item, enabled_by_default, question,
               yes_reply, no_reply))
 
-  write_action_env_to_bazelrc(var_name, var)
+  if not bazel_config_name:
+    write_action_env_to_bazelrc(var_name, var)
+  elif var:
+    write_to_bazelrc('build --config=%s' % bazel_config_name)
+
   environ_cp[var_name] = str(var)
 
@@ -543,7 +548,8 @@ def set_tf_cuda_clang(environ_cp):
       False,
       question=question,
       yes_reply=yes_reply,
-      no_reply=no_reply)
+      no_reply=no_reply,
+      bazel_config_name='cuda_clang')
 
 
 def set_tf_download_clang(environ_cp):
@@ -558,7 +564,8 @@ def set_tf_download_clang(environ_cp):
      False,
      question=question,
      yes_reply=yes_reply,
-     no_reply=no_reply)
+     no_reply=no_reply,
+     bazel_config_name='download_clang')
 
 
 def get_from_env_or_user_or_default(environ_cp, var_name, ask_for_var,
@@ -782,8 +789,8 @@ def get_ndk_api_level(environ_cp, android_ndk_home_path):
     print('WARNING: The NDK version in %s is %s, which is not '
           'supported by Bazel (officially supported versions: %s). Please use '
          'another version. Compiling Android targets may result in confusing '
-          'errors.\n' % (android_ndk_home_path, ndk_version,
-                         _SUPPORTED_ANDROID_NDK_VERSIONS))
+          'errors.\n' %
+          (android_ndk_home_path, ndk_version, _SUPPORTED_ANDROID_NDK_VERSIONS))
 
   # Now grab the NDK API level to use.
Note that this is different from the # SDK API level, as the NDK API level is effectively the *min* target SDK @@ -952,6 +959,7 @@ def set_tf_nccl_version(environ_cp): ask_nccl_version, '') environ_cp['TF_NCCL_VERSION'] = tf_nccl_version + def get_native_cuda_compute_capabilities(environ_cp): """Get native cuda compute capabilities. @@ -1293,9 +1301,6 @@ def configure_ios(): """ if not is_macos(): return - if _TF_CURRENT_BAZEL_VERSION is None or _TF_CURRENT_BAZEL_VERSION < 23000: - print( - 'Building Bazel rules on Apple platforms requires Bazel 0.23 or later.') for filepath in APPLE_BAZEL_FILES: existing_filepath = os.path.join(_TF_WORKSPACE_ROOT, filepath + '.apple') renamed_filepath = os.path.join(_TF_WORKSPACE_ROOT, filepath) @@ -1386,7 +1391,7 @@ def main(): # environment variables. environ_cp = dict(os.environ) - current_bazel_version = check_bazel_version('0.24.1', '0.25.2') + current_bazel_version = check_bazel_version('0.24.1', '0.25.3') _TF_CURRENT_BAZEL_VERSION = convert_version_to_int(current_bazel_version) reset_tf_configure_bazelrc() @@ -1422,7 +1427,12 @@ def main(): set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support', xla_enabled_by_default, 'xla') - set_action_env_var(environ_cp, 'TF_NEED_OPENCL_SYCL', 'OpenCL SYCL', False) + set_action_env_var( + environ_cp, + 'TF_NEED_OPENCL_SYCL', + 'OpenCL SYCL', + False, + bazel_config_name='sycl') if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1': set_host_cxx_compiler(environ_cp) set_host_c_compiler(environ_cp) @@ -1432,30 +1442,44 @@ def main(): else: set_trisycl_include_dir(environ_cp) - set_action_env_var(environ_cp, 'TF_NEED_ROCM', 'ROCm', False) + set_action_env_var( + environ_cp, 'TF_NEED_ROCM', 'ROCm', False, bazel_config_name='rocm') if (environ_cp.get('TF_NEED_ROCM') == '1' and 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get('LD_LIBRARY_PATH') != '1'): write_action_env_to_bazelrc('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH')) - set_action_env_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False) + environ_cp['TF_NEED_CUDA'] = str( + int(get_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False))) if (environ_cp.get('TF_NEED_CUDA') == '1' and 'TF_CUDA_CONFIG_REPO' not in environ_cp): - set_action_env_var(environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False) + set_action_env_var( + environ_cp, + 'TF_NEED_TENSORRT', + 'TensorRT', + False, + bazel_config_name='tensorrt') environ_save = dict(environ_cp) for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS): if validate_cuda_config(environ_cp): cuda_env_names = [ - 'TF_CUDA_VERSION', 'TF_CUBLAS_VERSION', 'TF_CUDNN_VERSION', - 'TF_TENSORRT_VERSION', 'TF_NCCL_VERSION', 'TF_CUDA_PATHS', + 'TF_CUDA_VERSION', + 'TF_CUBLAS_VERSION', + 'TF_CUDNN_VERSION', + 'TF_TENSORRT_VERSION', + 'TF_NCCL_VERSION', + 'TF_CUDA_PATHS', # Items below are for backwards compatibility when not using # TF_CUDA_PATHS. - 'CUDA_TOOLKIT_PATH', 'CUDNN_INSTALL_PATH', 'NCCL_INSTALL_PATH', - 'NCCL_HDR_PATH', 'TENSORRT_INSTALL_PATH' + 'CUDA_TOOLKIT_PATH', + 'CUDNN_INSTALL_PATH', + 'NCCL_INSTALL_PATH', + 'NCCL_HDR_PATH', + 'TENSORRT_INSTALL_PATH' ] # Note: set_action_env_var above already writes to bazelrc. for name in cuda_env_names: @@ -1506,8 +1530,6 @@ def main(): # CUDA not required. Ask whether we should download the clang toolchain and # use it for the CPU build. set_tf_download_clang(environ_cp) - if environ_cp.get('TF_DOWNLOAD_CLANG') == '1': - write_to_bazelrc('build --config=download_clang') # SYCL / ROCm / CUDA are mutually exclusive. # At most 1 GPU platform can be configured. 
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py index 8a14abc3c2c..a83ff3a16c2 100644 --- a/tensorflow/api_template_v1.__init__.py +++ b/tensorflow/api_template_v1.__init__.py @@ -59,7 +59,7 @@ except ImportError: from tensorflow.python.util.lazy_loader import LazyLoader # pylint: disable=g-import-not-at-top _CONTRIB_WARNING = """ -WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0. +The TensorFlow contrib module will not be included in TensorFlow 2.0. For more information, please see: * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md * https://github.com/tensorflow/addons diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 6928cf5d0ac..99eb28c1295 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -21,6 +21,9 @@ filegroup( srcs = [ "c_api.h", "c_api_experimental.h", + "tf_attrtype.h", + "tf_datatype.h", + "tf_status.h", ], visibility = ["//tensorflow:__subpackages__"], ) @@ -51,6 +54,8 @@ tf_cuda_library( hdrs = [ "c_api.h", "c_api_internal.h", + "tf_datatype.h", + "tf_status.h", ], visibility = [ "//tensorflow:internal", @@ -61,6 +66,7 @@ tf_cuda_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//conditions:default": [ + ":tf_attrtype", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -71,16 +77,26 @@ tf_cuda_library( }), ) +cc_library( + name = "tf_attrtype", + hdrs = ["tf_attrtype.h"], + visibility = ["//visibility:public"], +) + tf_cuda_library( name = "c_api", hdrs = [ "c_api.h", + "tf_attrtype.h", + "tf_datatype.h", + "tf_status.h", ], copts = tf_copts(), visibility = ["//visibility:public"], deps = [ ":c_api_no_xla", ":c_api_internal", + ":tf_attrtype", ] + select({ "//tensorflow:with_xla_support": [ "//tensorflow/compiler/tf2xla:xla_compiler", @@ -96,14 +112,21 @@ tf_cuda_library( "c_api.cc", "c_api_function.cc", ], - hdrs = ["c_api.h"], + hdrs = [ + "c_api.h", + ], copts = tf_copts(), visibility = ["//tensorflow/c:__subpackages__"], - deps = [":c_api_internal"] + select({ + deps = [ + ":c_api_internal", + ":tf_attrtype", + ":tf_datatype", + ] + select({ "//tensorflow:android": [ "//tensorflow/core:android_tensorflow_lib_lite", ], "//conditions:default": [ + ":tf_status", "@com_google_absl//absl/strings", "//tensorflow/cc/saved_model:loader_lite", "//tensorflow/cc:gradients", @@ -124,6 +147,37 @@ tf_cuda_library( }), ) +cc_library( + name = "tf_status", + srcs = ["tf_status.cc"], + hdrs = ["tf_status.h"], + visibility = ["//visibility:public"], + deps = select({ + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib_lite", + ], + "//conditions:default": [ + "//tensorflow/c:c_api_internal", + "//tensorflow/core:lib", + ], + }), +) + +cc_library( + name = "tf_datatype", + srcs = ["tf_datatype.cc"], + hdrs = ["tf_datatype.h"], + visibility = ["//visibility:public"], + deps = select({ + "//tensorflow:android": [ + "//tensorflow/core:android_tensorflow_lib_lite", + ], + "//conditions:default": [ + "//tensorflow/core:framework", + ], + }), +) + tf_cuda_library( name = "c_api_experimental", srcs = [ @@ -137,6 +191,7 @@ tf_cuda_library( deps = [ ":c_api", ":c_api_internal", + ":checkpoint_reader", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:c_api_internal", "//tensorflow/compiler/jit:flags", @@ -151,15 +206,6 @@ tf_cuda_library( ], ) -cc_library( - name = "c_api_headers", - hdrs = [ - "c_api.h", - ], - copts = tf_copts(), - visibility = ["//tensorflow:__subpackages__"], -) - exports_files( 
[ "version_script.lds", diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 21d72ac96b5..4f519a7bd11 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/strings/match.h" // Required for IS_MOBILE_PLATFORM #include "tensorflow/core/platform/platform.h" // NOLINT @@ -97,7 +98,6 @@ using tensorflow::TensorId; using tensorflow::TensorShape; using tensorflow::TensorShapeProto; using tensorflow::VersionDef; -using tensorflow::error::Code; using tensorflow::errors::FailedPrecondition; using tensorflow::errors::InvalidArgument; using tensorflow::gtl::ArraySlice; @@ -108,34 +108,6 @@ extern "C" { // -------------------------------------------------------------------------- const char* TF_Version() { return TF_VERSION_STRING; } -// -------------------------------------------------------------------------- -size_t TF_DataTypeSize(TF_DataType dt) { - return static_cast( - tensorflow::DataTypeSize(static_cast(dt))); -} - -// -------------------------------------------------------------------------- - -TF_Status* TF_NewStatus() { return new TF_Status; } - -void TF_DeleteStatus(TF_Status* s) { delete s; } - -void TF_SetStatus(TF_Status* s, TF_Code code, const char* msg) { - if (code == TF_OK) { - s->status = Status::OK(); - return; - } - s->status = Status(static_cast(code), tensorflow::StringPiece(msg)); -} - -TF_Code TF_GetCode(const TF_Status* s) { - return static_cast(s->status.code()); -} - -const char* TF_Message(const TF_Status* s) { - return s->status.error_message().c_str(); -} - // -------------------------------------------------------------------------- namespace { @@ -1697,7 +1669,7 @@ TF_AttrMetadata TF_OperationGetAttrMetadata(TF_Operation* oper, if (metadata.list_size == 0) { for (int i = 0; i < oper->node.op_def().attr_size(); ++i) { const auto& a = oper->node.op_def().attr(i); - if (a.name().compare(attr_name) != 0) continue; + if (a.name() != attr_name) continue; const string& typestr = a.type(); if (typestr == "list(string)") { metadata.type = TF_ATTR_STRING; @@ -2517,8 +2489,7 @@ void TF_AddGradientsWithPrefix(TF_Graph* g, const char* prefix, TF_Output* y, // used in this graph for (const auto& pair : g->name_map) { const string& name = pair.first; - if (name.compare(prefix) == 0 || - tensorflow::str_util::StartsWith(name, prefix_cmp)) { + if ((name == prefix) || absl::StartsWith(name, prefix_cmp)) { status->status = InvalidArgument( "prefix [", prefix, "] conflicts with existing node in the graph named [", name, "]"); @@ -2548,8 +2519,7 @@ void TF_AddGradientsWithPrefix(TF_Graph* g, const char* prefix, TF_Output* y, // Adding the gradients to the graph can alter the prefix to prevent // name collisions only if this prefix has not been provided explicitly // by the user. If it was provided, assert that it remained intact. - if (prefix != nullptr && - !tensorflow::str_util::StartsWith(n->name(), prefix_cmp)) { + if (prefix != nullptr && !absl::StartsWith(n->name(), prefix_cmp)) { status->status = tensorflow::errors::Internal( "BUG: The gradients prefix have been unexpectedly altered when " "adding the nodes to the graph. This is a bug. Please file an " diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 051de3a7dc0..9a538cb98db 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -19,6 +19,10 @@ limitations under the License. 
#include #include +#include "tensorflow/c/tf_attrtype.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_status.h" + // -------------------------------------------------------------------------- // C API for TensorFlow. // @@ -69,7 +73,7 @@ limitations under the License. // .dylib, .dll). // This duplicates the TF_EXPORT macro definition in // tensorflow/core/platform/macros.h in order to keep this .h file independent -// of any other includes.$a +// of any other includes. #ifdef SWIG #define TF_CAPI_EXPORT #else @@ -93,89 +97,6 @@ extern "C" { // TensorFlow library. TensorFlow using semantic versioning. TF_CAPI_EXPORT extern const char* TF_Version(void); -// -------------------------------------------------------------------------- -// TF_DataType holds the type for a scalar value. E.g., one slot in a tensor. -// The enum values here are identical to corresponding values in types.proto. -typedef enum TF_DataType { - TF_FLOAT = 1, - TF_DOUBLE = 2, - TF_INT32 = 3, // Int32 tensors are always in 'host' memory. - TF_UINT8 = 4, - TF_INT16 = 5, - TF_INT8 = 6, - TF_STRING = 7, - TF_COMPLEX64 = 8, // Single-precision complex - TF_COMPLEX = 8, // Old identifier kept for API backwards compatibility - TF_INT64 = 9, - TF_BOOL = 10, - TF_QINT8 = 11, // Quantized int8 - TF_QUINT8 = 12, // Quantized uint8 - TF_QINT32 = 13, // Quantized int32 - TF_BFLOAT16 = 14, // Float32 truncated to 16 bits. Only for cast ops. - TF_QINT16 = 15, // Quantized int16 - TF_QUINT16 = 16, // Quantized uint16 - TF_UINT16 = 17, - TF_COMPLEX128 = 18, // Double-precision complex - TF_HALF = 19, - TF_RESOURCE = 20, - TF_VARIANT = 21, - TF_UINT32 = 22, - TF_UINT64 = 23, -} TF_DataType; - -// TF_DataTypeSize returns the sizeof() for the underlying type corresponding -// to the given TF_DataType enum value. Returns 0 for variable length types -// (eg. TF_STRING) or on failure. -TF_CAPI_EXPORT extern size_t TF_DataTypeSize(TF_DataType dt); - -// -------------------------------------------------------------------------- -// TF_Code holds an error code. The enum values here are identical to -// corresponding values in error_codes.proto. -typedef enum TF_Code { - TF_OK = 0, - TF_CANCELLED = 1, - TF_UNKNOWN = 2, - TF_INVALID_ARGUMENT = 3, - TF_DEADLINE_EXCEEDED = 4, - TF_NOT_FOUND = 5, - TF_ALREADY_EXISTS = 6, - TF_PERMISSION_DENIED = 7, - TF_UNAUTHENTICATED = 16, - TF_RESOURCE_EXHAUSTED = 8, - TF_FAILED_PRECONDITION = 9, - TF_ABORTED = 10, - TF_OUT_OF_RANGE = 11, - TF_UNIMPLEMENTED = 12, - TF_INTERNAL = 13, - TF_UNAVAILABLE = 14, - TF_DATA_LOSS = 15, -} TF_Code; - -// -------------------------------------------------------------------------- -// TF_Status holds error information. It either has an OK code, or -// else an error code with an associated error message. -typedef struct TF_Status TF_Status; - -// Return a new status object. -TF_CAPI_EXPORT extern TF_Status* TF_NewStatus(void); - -// Delete a previously created status object. -TF_CAPI_EXPORT extern void TF_DeleteStatus(TF_Status*); - -// Record in *s. Any previous information is lost. -// A common use is to clear a status: TF_SetStatus(s, TF_OK, ""); -TF_CAPI_EXPORT extern void TF_SetStatus(TF_Status* s, TF_Code code, - const char* msg); - -// Return the code record in *s. -TF_CAPI_EXPORT extern TF_Code TF_GetCode(const TF_Status* s); - -// Return a pointer to the (null-terminated) error message in *s. The -// return value points to memory that is only usable until the next -// mutation to *s. Always returns an empty string if TF_GetCode(s) is -// TF_OK. 
-TF_CAPI_EXPORT extern const char* TF_Message(const TF_Status* s); - // -------------------------------------------------------------------------- // TF_Buffer holds a pointer to a block of data and its associated length. // Typically, the data consists of a serialized protocol buffer, but other data @@ -686,19 +607,6 @@ TF_CAPI_EXPORT extern int TF_OperationGetControlOutputs( TF_Operation* oper, TF_Operation** control_outputs, int max_control_outputs); -// TF_AttrType describes the type of the value of an attribute on an operation. -typedef enum TF_AttrType { - TF_ATTR_STRING = 0, - TF_ATTR_INT = 1, - TF_ATTR_FLOAT = 2, - TF_ATTR_BOOL = 3, - TF_ATTR_TYPE = 4, - TF_ATTR_SHAPE = 5, - TF_ATTR_TENSOR = 6, - TF_ATTR_PLACEHOLDER = 7, - TF_ATTR_FUNC = 8, -} TF_AttrType; - // TF_AttrMetadata describes the value of an attribute on an operation. typedef struct TF_AttrMetadata { // A boolean: 1 if the attribute value is a list, 0 otherwise. diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index 726ce2784ae..246fa91eccd 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/strings/substitute.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/checkpoint_reader.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/compiler/jit/flags.h" @@ -37,6 +38,7 @@ using tensorflow::FunctionDef; using tensorflow::Node; using tensorflow::NodeBuilder; using tensorflow::Status; +using tensorflow::errors::InvalidArgument; namespace { typedef std::unique_ptr @@ -149,7 +151,7 @@ const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) { } char* TF_FunctionDebugString(TF_Function* func, size_t* len) { - const auto& debug_str = func->fdef.DebugString(); + const auto& debug_str = DebugString(func->fdef); *len = debug_str.size(); char* ret = static_cast(malloc(*len + 1)); memcpy(ret, debug_str.c_str(), *len + 1); @@ -576,6 +578,73 @@ void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg) { status->status = tensorflow::errors::Internal(errMsg); } +struct TF_CheckpointReader : public tensorflow::checkpoint::CheckpointReader { + using tensorflow::checkpoint::CheckpointReader::CheckpointReader; + std::vector variable_list; +}; + +TF_CheckpointReader* TF_NewCheckpointReader(const char* filename, + TF_Status* status) { + TF_CheckpointReader* reader = new TF_CheckpointReader(filename, status); + if (!status->status.ok()) return nullptr; + const auto& m = reader->GetVariableToDataTypeMap(); + for (auto it = m.begin(); it != m.end(); ++it) + reader->variable_list.push_back(it->first); + std::sort(reader->variable_list.begin(), reader->variable_list.end()); + return reader; +} + +void TF_DeleteCheckpointReader(TF_CheckpointReader* reader) { delete reader; } + +int TF_CheckpointReaderHasTensor(TF_CheckpointReader* reader, + const char* name) { + return reader->HasTensor(name); +} + +const char* TF_CheckpointReaderGetVariable(TF_CheckpointReader* reader, + int index) { + return reader->variable_list[index].c_str(); +} + +int TF_CheckpointReaderSize(TF_CheckpointReader* reader) { + return reader->variable_list.size(); +} + +TF_DataType TF_CheckpointReaderGetVariableDataType(TF_CheckpointReader* reader, + const char* name) { + const auto& m = reader->GetVariableToDataTypeMap(); + return static_cast(m.at(name)); +} + +TF_Tensor* TF_CheckpointReaderGetTensor(TF_CheckpointReader* 
reader, + const char* name, TF_Status* status) { + std::unique_ptr tensor; + reader->GetTensor(name, &tensor, status); + if (!status->status.ok()) return nullptr; + return tensorflow::TF_TensorFromTensor(*tensor.get(), status); +} + +void TF_CheckpointReaderGetVariableShape(TF_CheckpointReader* reader, + const char* name, int64_t* dims, + int num_dims, TF_Status* status) { + const auto& shape = reader->GetVariableToShapeMap().at(name); + int rank = shape.dims(); + if (num_dims != rank) { + status->status = InvalidArgument("Expected rank is ", num_dims, + " but actual rank is ", rank); + return; + } + for (int i = 0; i < num_dims; i++) { + dims[i] = shape.dim_size(i); + } +} + +int TF_CheckpointReaderGetVariableNumDims(TF_CheckpointReader* reader, + const char* name) { + const auto& m = reader->GetVariableToShapeMap(); + return m.at(name).dims(); +} + // This builder is used in the eager API to build a NodeDef. struct TF_AttrBuilder : public tensorflow::AttrBuilder { using tensorflow::AttrBuilder::AttrBuilder; diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index 795768a1415..25056904423 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -208,6 +208,34 @@ TF_CAPI_EXPORT extern void TFE_ExecuteOpNotificationWaitAndDelete( TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg); +// TF_NewCheckpointReader() return the CheckpointReader that can be use to +// investigate or load the variable from the checkpoint file +typedef struct TF_CheckpointReader TF_CheckpointReader; +TF_CAPI_EXPORT extern TF_CheckpointReader* TF_NewCheckpointReader( + const char* filename, TF_Status* status); +TF_CAPI_EXPORT extern void TF_DeleteCheckpointReader( + TF_CheckpointReader* reader); +TF_CAPI_EXPORT extern int TF_CheckpointReaderHasTensor( + TF_CheckpointReader* reader, const char* name); +// Get the variable name at the given index +TF_CAPI_EXPORT extern const char* TF_CheckpointReaderGetVariable( + TF_CheckpointReader* reader, int index); +// Get the number of variable in the checkpoint +TF_CAPI_EXPORT extern int TF_CheckpointReaderSize(TF_CheckpointReader* reader); +// Get the DataType of a variable +TF_CAPI_EXPORT extern TF_DataType TF_CheckpointReaderGetVariableDataType( + TF_CheckpointReader* reader, const char* name); +// Read the shape of a variable and write to `dims` +TF_CAPI_EXPORT extern void TF_CheckpointReaderGetVariableShape( + TF_CheckpointReader* reader, const char* name, int64_t* dims, int num_dims, + TF_Status* status); +// Get the number of dimension of a variable +TF_CAPI_EXPORT extern int TF_CheckpointReaderGetVariableNumDims( + TF_CheckpointReader* reader, const char* name); +// Load the weight of a variable +TF_CAPI_EXPORT extern TF_Tensor* TF_CheckpointReaderGetTensor( + TF_CheckpointReader* reader, const char* name, TF_Status* status); + // TF_NewAttrBuilder() returns an object that you can set attributes on as // though it were an op. This allows querying properties of that op for // type-checking purposes like if the op will run on a particular device type. 
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc index 6eb289107c5..55f3a8599fd 100644 --- a/tensorflow/c/c_api_experimental_test.cc +++ b/tensorflow/c/c_api_experimental_test.cc @@ -62,8 +62,8 @@ protocol: "grpc" TF_Buffer* null_result = TFE_GetServerDef(malformed_text_proto.c_str(), status); EXPECT_NE(TF_GetCode(status), TF_OK); - EXPECT_TRUE(tensorflow::str_util::StrContains( - TF_Message(status), "Invalid text proto for ServerDef")); + EXPECT_TRUE(absl::StrContains(TF_Message(status), + "Invalid text proto for ServerDef")); EXPECT_EQ(null_result, nullptr); // Cleanup diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc index 760f14cac5b..847a81f5424 100644 --- a/tensorflow/c/c_api_function_test.cc +++ b/tensorflow/c/c_api_function_test.cc @@ -253,7 +253,7 @@ class CApiFunctionTest : public ::testing::Test { const std::unordered_set& nodes) { ASSERT_EQ(nodes.size(), fdef.node_def_size()) << "Got unexpected number of nodes. Expected: [" - << str_util::Join(nodes, ", ") + << absl::StrJoin(nodes, ", ") << "] Actual nodes in fdef: " << fdef.DebugString(); for (const NodeDef& node_def : fdef.node_def()) { ASSERT_TRUE(nodes.find(node_def.name()) != nodes.end()) diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 2be03bf0de6..49076039fa7 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -56,7 +56,7 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst); namespace { static void ExpectHasSubstr(StringPiece s, StringPiece expected) { - EXPECT_TRUE(str_util::StrContains(s, expected)) + EXPECT_TRUE(absl::StrContains(s, expected)) << "'" << s << "' does not contain '" << expected << "'"; } diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 8c2be2af3e0..0db85a17802 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -1,6 +1,8 @@ # Experimental extensions to the C API for eager execution of kernels. -licenses(["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) load( "//tensorflow:tensorflow.bzl", diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 0b86a78d41e..d8476bec2e4 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -30,7 +30,9 @@ limitations under the License. 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/platform/host_info.h"
+#include "tensorflow/core/platform/platform.h" // NOLINT
 #ifdef TENSORFLOW_EAGER_USE_XLA
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #endif // TENSORFLOW_EAGER_USE_XLA
@@ -135,11 +137,12 @@ tensorflow::Status CreateRemoteContexts(
     const std::vector& remote_workers, int64 rendezvous_id,
     int keep_alive_secs, const tensorflow::ServerDef& server_def,
     tensorflow::eager::EagerClientCache* remote_eager_workers, bool async,
+    const tensorflow::eager::CreateContextRequest& base_request,
     tensorflow::gtl::FlatMap* remote_contexts) {
   for (int i = 0; i < remote_workers.size(); i++) {
     const string& remote_worker = remote_workers[i];
-    tensorflow::eager::CreateContextRequest request;
+    tensorflow::eager::CreateContextRequest request(base_request);
     tensorflow::eager::CreateContextResponse response;
     request.set_rendezvous_id(rendezvous_id);
     tensorflow::DeviceNameUtils::ParsedName parsed_name;
@@ -221,6 +224,23 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
       remote_workers, grpc_server->master_env()->worker_cache,
       &remote_device_mgr));
 
+  std::vector cluster_device_attributes;
+  remote_device_mgr->ListDeviceAttributes(&cluster_device_attributes);
+
+  std::vector local_device_attributes;
+  grpc_server->worker_env()->device_mgr->ListDeviceAttributes(
+      &local_device_attributes);
+
+  // This request makes sure that we can create Rendezvous properly between
+  // local and remote contexts.
+  tensorflow::eager::CreateContextRequest base_request;
+  for (const auto& da : cluster_device_attributes) {
+    *base_request.add_cluster_device_attributes() = da;
+  }
+  for (const auto& da : local_device_attributes) {
+    *base_request.add_cluster_device_attributes() = da;
+  }
+
   std::shared_ptr channel_cache = grpc_server->channel_cache();
   std::unique_ptr remote_eager_workers(
@@ -230,14 +250,16 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
   tensorflow::gtl::FlatMap remote_contexts;
   LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts(
       remote_workers, rendezvous_id, keep_alive_secs, server_def,
-      remote_eager_workers.get(), ctx->context->Async(), &remote_contexts));
+      remote_eager_workers.get(), ctx->context->Async(), base_request,
+      &remote_contexts));
 
   tensorflow::RemoteRendezvous* r =
       grpc_server->worker_env()->rendezvous_mgr->Find(rendezvous_id);
 
   auto session_name = tensorflow::strings::StrCat("eager_", rendezvous_id);
   TF_RETURN_IF_ERROR(grpc_server->worker_env()->session_mgr->CreateSession(
-      session_name, server_def, true));
+      session_name, server_def, base_request.cluster_device_attributes(),
+      true));
 
   std::shared_ptr worker_session;
   TF_RETURN_IF_ERROR(
@@ -250,9 +272,10 @@ tensorflow::Status UpdateTFE_ContextWithServerDef(
   auto* device_mgr = grpc_server->worker_env()->device_mgr;
 
   return ctx->context->InitializeRemote(
-      std::move(server), std::move(remote_eager_workers),
-      std::move(remote_device_mgr), remote_contexts, r, device_mgr,
-      keep_alive_secs);
+      std::move(server), grpc_server->worker_env(), worker_session,
+      std::move(remote_eager_workers), std::move(remote_device_mgr),
+      remote_contexts, r, device_mgr, keep_alive_secs,
+      worker_session->cluster_flr.get());
 #undef LOG_AND_RETURN_IF_ERROR
 }
 #endif // !IS_MOBILE_PLATFORM
@@ -970,6 +993,23 @@ const tensorflow::Tensor* TFE_TensorHandleUnderlyingTensorInHostMemory(
   return t;
 }
 
+TFE_TensorHandle*
TFE_TensorHandleMaybeCopyToHostCPU(TFE_TensorHandle* h, + TF_Status* status) { + // TensorHandles created by PyFuncOp lack context and therefore could + // not be copied. + if (!h->handle->OnHostCPU() && h->handle->Context() != nullptr) { + tensorflow::TensorHandle* handle; + status->status = tensorflow::EagerCopyToDevice( + h->handle, h->handle->Context(), "CPU:0", &handle); + if (status->status.ok()) { + return new TFE_TensorHandle(handle); + } else { + return nullptr; + } + } + return h; +} + void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, TF_Status* status) { TFE_ContextAsyncWait(ctx, status); diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index d5223e63f13..076760161e1 100755 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -462,6 +462,9 @@ class Tensor; const tensorflow::Tensor* TFE_TensorHandleUnderlyingTensorInHostMemory( TFE_TensorHandle* h, TF_Status* status); + +TFE_TensorHandle* TFE_TensorHandleMaybeCopyToHostCPU(TFE_TensorHandle* h, + TF_Status* status); TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t); #endif diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc index b4192716c4f..eaa520d72cc 100644 --- a/tensorflow/c/eager/c_api_debug.cc +++ b/tensorflow/c/eager/c_api_debug.cc @@ -78,7 +78,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( status->status = tensorflow::Status::OK(); } else { VLOG(3) << "Fully padded shape of [" - << tensorflow::str_util::Join(shape_to_log, ", ") << "] is " + << absl::StrJoin(shape_to_log, ", ") << "] is " << padded_shape.DebugString(); } } diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc index 4e48a7591a9..53984c0e6c0 100644 --- a/tensorflow/c/eager/c_api_experimental_test.cc +++ b/tensorflow/c/eager/c_api_experimental_test.cc @@ -33,7 +33,7 @@ namespace tensorflow { namespace { static bool HasSubstr(absl::string_view base, absl::string_view substr) { - bool ok = str_util::StrContains(base, substr); + bool ok = absl::StrContains(base, substr); EXPECT_TRUE(ok) << base << ", expected substring " << substr; return ok; } diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 57aa71d5b3b..57bea7311e6 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -1408,29 +1408,34 @@ void FunctionDefAndExecute(bool async) { status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_TensorHandle* m = TestMatrixTensorHandle(); - TFE_TensorHandle* retval[1] = {nullptr}; - int num_retvals = 1; - TFE_Op* op = TFE_NewOp(ctx, "MatMulFunction", status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_OpAddInput(op, m, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_Execute(op, &retval[0], &num_retvals, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - ASSERT_EQ(1, num_retvals); - TFE_DeleteOp(op); - TFE_DeleteTensorHandle(m); - TF_Tensor* t = TFE_TensorHandleResolve(retval[0], status); - TFE_DeleteTensorHandle(retval[0]); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - float product[4] = {0}; - EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); - memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); - TF_DeleteTensor(t); - EXPECT_EQ(7, product[0]); - EXPECT_EQ(10, product[1]); - EXPECT_EQ(15, product[2]); - EXPECT_EQ(22, product[3]); + for (bool clear_cache : {true, false, true}) { + if (clear_cache) 
{ + TFE_ContextClearCaches(ctx); + } + TFE_TensorHandle* m = TestMatrixTensorHandle(); + TFE_TensorHandle* retval[1] = {nullptr}; + int num_retvals = 1; + TFE_Op* op = TFE_NewOp(ctx, "MatMulFunction", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, m, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Execute(op, &retval[0], &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TFE_DeleteOp(op); + TFE_DeleteTensorHandle(m); + TF_Tensor* t = TFE_TensorHandleResolve(retval[0], status); + TFE_DeleteTensorHandle(retval[0]); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + float product[4] = {0}; + EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); + memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(7, product[0]); + EXPECT_EQ(10, product[1]); + EXPECT_EQ(15, product[2]); + EXPECT_EQ(22, product[3]); + } TFE_ContextRemoveFunction(ctx, "MatMulFunction", status); ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); TFE_DeleteContext(ctx); diff --git a/tensorflow/c/experimental/BUILD b/tensorflow/c/experimental/BUILD index b66969eb3ff..bc408e637c2 100644 --- a/tensorflow/c/experimental/BUILD +++ b/tensorflow/c/experimental/BUILD @@ -1,7 +1,9 @@ # Description: # Experimental C APIs for TensorFlow. -licenses(["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) load( "//tensorflow:tensorflow.bzl", diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD index 597182ab016..c71f6f1cca2 100644 --- a/tensorflow/c/kernels/BUILD +++ b/tensorflow/c/kernels/BUILD @@ -6,10 +6,9 @@ load( package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - tf_kernel_library( name = "bitcast_op", prefix = "bitcast_op", diff --git a/tensorflow/c/tf_attrtype.h b/tensorflow/c/tf_attrtype.h new file mode 100644 index 00000000000..0c1545db232 --- /dev/null +++ b/tensorflow/c/tf_attrtype.h @@ -0,0 +1,39 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_TF_ATTRTYPE_H_ +#define TENSORFLOW_C_TF_ATTRTYPE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// TF_AttrType describes the type of the value of an attribute on an operation. +typedef enum TF_AttrType { + TF_ATTR_STRING = 0, + TF_ATTR_INT = 1, + TF_ATTR_FLOAT = 2, + TF_ATTR_BOOL = 3, + TF_ATTR_TYPE = 4, + TF_ATTR_SHAPE = 5, + TF_ATTR_TENSOR = 6, + TF_ATTR_PLACEHOLDER = 7, + TF_ATTR_FUNC = 8, +} TF_AttrType; + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_ATTRTYPE_H_ diff --git a/tensorflow/c/tf_datatype.cc b/tensorflow/c/tf_datatype.cc new file mode 100644 index 00000000000..d2a66d99dac --- /dev/null +++ b/tensorflow/c/tf_datatype.cc @@ -0,0 +1,23 @@ +/* Copyright 2019 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/tf_datatype.h" + +#include "tensorflow/core/framework/types.h" + +size_t TF_DataTypeSize(TF_DataType dt) { + return static_cast( + tensorflow::DataTypeSize(static_cast(dt))); +} diff --git a/tensorflow/c/tf_datatype.h b/tensorflow/c/tf_datatype.h new file mode 100644 index 00000000000..3e6121bf989 --- /dev/null +++ b/tensorflow/c/tf_datatype.h @@ -0,0 +1,83 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_DATATYPE_H_ +#define TENSORFLOW_C_TF_DATATYPE_H_ + +#include + +// Macro to control visibility of exported symbols in the shared library (.so, +// .dylib, .dll). +// This duplicates the TF_EXPORT macro definition in +// tensorflow/core/platform/macros.h in order to keep this .h file independent +// of any other includes. +#ifdef SWIG +#define TF_CAPI_EXPORT +#else +#if defined(_WIN32) +#ifdef TF_COMPILE_LIBRARY +#define TF_CAPI_EXPORT __declspec(dllexport) +#else +#define TF_CAPI_EXPORT __declspec(dllimport) +#endif // TF_COMPILE_LIBRARY +#else +#define TF_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 +#endif // SWIG + +#ifdef __cplusplus +extern "C" { +#endif + +// -------------------------------------------------------------------------- +// TF_DataType holds the type for a scalar value. E.g., one slot in a tensor. +// The enum values here are identical to corresponding values in types.proto. +typedef enum TF_DataType { + TF_FLOAT = 1, + TF_DOUBLE = 2, + TF_INT32 = 3, // Int32 tensors are always in 'host' memory. + TF_UINT8 = 4, + TF_INT16 = 5, + TF_INT8 = 6, + TF_STRING = 7, + TF_COMPLEX64 = 8, // Single-precision complex + TF_COMPLEX = 8, // Old identifier kept for API backwards compatibility + TF_INT64 = 9, + TF_BOOL = 10, + TF_QINT8 = 11, // Quantized int8 + TF_QUINT8 = 12, // Quantized uint8 + TF_QINT32 = 13, // Quantized int32 + TF_BFLOAT16 = 14, // Float32 truncated to 16 bits. Only for cast ops. 
+ TF_QINT16 = 15, // Quantized int16 + TF_QUINT16 = 16, // Quantized uint16 + TF_UINT16 = 17, + TF_COMPLEX128 = 18, // Double-precision complex + TF_HALF = 19, + TF_RESOURCE = 20, + TF_VARIANT = 21, + TF_UINT32 = 22, + TF_UINT64 = 23, +} TF_DataType; + +// TF_DataTypeSize returns the sizeof() for the underlying type corresponding +// to the given TF_DataType enum value. Returns 0 for variable length types +// (eg. TF_STRING) or on failure. +TF_CAPI_EXPORT extern size_t TF_DataTypeSize(TF_DataType dt); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_DATATYPE_H_ diff --git a/tensorflow/c/tf_status.cc b/tensorflow/c/tf_status.cc new file mode 100644 index 00000000000..a77b18c2ca0 --- /dev/null +++ b/tensorflow/c/tf_status.cc @@ -0,0 +1,42 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/tf_status.h" + +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/core/lib/core/status.h" + +using ::tensorflow::Status; +using ::tensorflow::error::Code; + +TF_Status* TF_NewStatus() { return new TF_Status; } + +void TF_DeleteStatus(TF_Status* s) { delete s; } + +void TF_SetStatus(TF_Status* s, TF_Code code, const char* msg) { + if (code == TF_OK) { + s->status = Status::OK(); + return; + } + s->status = Status(static_cast(code), tensorflow::StringPiece(msg)); +} + +TF_Code TF_GetCode(const TF_Status* s) { + return static_cast(s->status.code()); +} + +const char* TF_Message(const TF_Status* s) { + return s->status.error_message().c_str(); +} diff --git a/tensorflow/c/tf_status.h b/tensorflow/c/tf_status.h new file mode 100644 index 00000000000..937f6bed2d7 --- /dev/null +++ b/tensorflow/c/tf_status.h @@ -0,0 +1,88 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_TF_STATUS_H_ +#define TENSORFLOW_C_TF_STATUS_H_ + +#ifdef SWIG +#define TF_CAPI_EXPORT +#else +#if defined(_WIN32) +#ifdef TF_COMPILE_LIBRARY +#define TF_CAPI_EXPORT __declspec(dllexport) +#else +#define TF_CAPI_EXPORT __declspec(dllimport) +#endif // TF_COMPILE_LIBRARY +#else +#define TF_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 +#endif // SWIG + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct TF_Status TF_Status; + +// -------------------------------------------------------------------------- +// TF_Code holds an error code. The enum values here are identical to +// corresponding values in error_codes.proto. +typedef enum TF_Code { + TF_OK = 0, + TF_CANCELLED = 1, + TF_UNKNOWN = 2, + TF_INVALID_ARGUMENT = 3, + TF_DEADLINE_EXCEEDED = 4, + TF_NOT_FOUND = 5, + TF_ALREADY_EXISTS = 6, + TF_PERMISSION_DENIED = 7, + TF_UNAUTHENTICATED = 16, + TF_RESOURCE_EXHAUSTED = 8, + TF_FAILED_PRECONDITION = 9, + TF_ABORTED = 10, + TF_OUT_OF_RANGE = 11, + TF_UNIMPLEMENTED = 12, + TF_INTERNAL = 13, + TF_UNAVAILABLE = 14, + TF_DATA_LOSS = 15, +} TF_Code; + +// -------------------------------------------------------------------------- + +// Return a new status object. +TF_CAPI_EXPORT extern TF_Status* TF_NewStatus(void); + +// Delete a previously created status object. +TF_CAPI_EXPORT extern void TF_DeleteStatus(TF_Status*); + +// Record in *s. Any previous information is lost. +// A common use is to clear a status: TF_SetStatus(s, TF_OK, ""); +TF_CAPI_EXPORT extern void TF_SetStatus(TF_Status* s, TF_Code code, + const char* msg); + +// Return the code record in *s. +TF_CAPI_EXPORT extern TF_Code TF_GetCode(const TF_Status* s); + +// Return a pointer to the (null-terminated) error message in *s. The +// return value points to memory that is only usable until the next +// mutation to *s. Always returns an empty string if TF_GetCode(s) is +// TF_OK. +TF_CAPI_EXPORT extern const char* TF_Message(const TF_Status* s); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_STATUS_H_ diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index bd741249cf2..c5e1262cec3 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -4,10 +4,9 @@ package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - filegroup( name = "srcs", srcs = [ @@ -638,6 +637,7 @@ cc_library( "//tensorflow/core:op_gen_lib", "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", ], ) @@ -657,6 +657,7 @@ tf_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc index 0605a62b83a..a0353bf17a6 100644 --- a/tensorflow/cc/framework/cc_op_gen.cc +++ b/tensorflow/cc/framework/cc_op_gen.cc @@ -13,11 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/cc/framework/cc_op_gen.h" + #include #include #include -#include "tensorflow/cc/framework/cc_op_gen.h" +#include "absl/strings/escaping.h" #include "tensorflow/core/framework/api_def.pb.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/attr_value_util.h" @@ -133,7 +135,7 @@ string MakeComment(StringPiece text, StringPiece indent) { } string PrintString(const string& str) { - return strings::StrCat("\"", str_util::CEscape(str), "\""); + return strings::StrCat("\"", absl::CEscape(str), "\""); } string PrintTensorShape(const TensorShapeProto& shape_proto) { @@ -191,7 +193,7 @@ string PrintTensor(const TensorProto& tensor_proto) { string ret; for (int64 i = 0; i < num_elts; ++i) { if (i > 0) strings::StrAppend(&ret, " "); - strings::StrAppend(&ret, str_util::CEscape(t.flat()(i))); + strings::StrAppend(&ret, absl::CEscape(t.flat()(i))); } return ret; } diff --git a/tensorflow/cc/framework/cc_op_gen_test.cc b/tensorflow/cc/framework/cc_op_gen_test.cc index 5d9dfd95a55..698ef5be20b 100644 --- a/tensorflow/cc/framework/cc_op_gen_test.cc +++ b/tensorflow/cc/framework/cc_op_gen_test.cc @@ -62,12 +62,12 @@ op { )"; void ExpectHasSubstr(StringPiece s, StringPiece expected) { - EXPECT_TRUE(str_util::StrContains(s, expected)) + EXPECT_TRUE(absl::StrContains(s, expected)) << "'" << s << "' does not contain '" << expected << "'"; } void ExpectDoesNotHaveSubstr(StringPiece s, StringPiece expected) { - EXPECT_FALSE(str_util::StrContains(s, expected)) + EXPECT_FALSE(absl::StrContains(s, expected)) << "'" << s << "' contains '" << expected << "'"; } diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc index e74ba009083..e93ca8633e6 100644 --- a/tensorflow/cc/framework/scope.cc +++ b/tensorflow/cc/framework/scope.cc @@ -275,7 +275,7 @@ std::unordered_set Scope::Impl::GetColocationConstraints( if (GetNodeAttr(attrs, kColocationAttrName, &node_constraints).ok()) { for (const string& entry : node_constraints) { StringPiece s(entry); - if (str_util::ConsumePrefix(&s, kColocationGroupPrefix)) { + if (absl::ConsumePrefix(&s, kColocationGroupPrefix)) { current_constraints.emplace(s); } } diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 13bc88f7cd3..8626ed0087e 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -3,10 +3,9 @@ package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) load( diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 70f362cfeae..dfc7ccd9542 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -308,7 +308,7 @@ Status LoadSavedModel(const SessionOptions& session_options, const Status status = LoadSavedModelInternal(session_options, run_options, export_dir, tags, bundle); auto log_and_count = [&](const string& status_str) { - LOG(INFO) << "SavedModel load for tags { " << str_util::Join(tags, " ") + LOG(INFO) << "SavedModel load for tags { " << absl::StrJoin(tags, " ") << " }; Status: " << status_str << ". 
Took " << GetLatencyMicroseconds(start_microseconds) << " microseconds."; load_attempt_count->GetCell(export_dir, status_str)->IncrementBy(1); diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc index 597e42bb65a..422994ba07c 100644 --- a/tensorflow/cc/saved_model/loader_test.cc +++ b/tensorflow/cc/saved_model/loader_test.cc @@ -136,7 +136,7 @@ TEST_F(LoaderTest, NoTagMatch) { Status st = LoadSavedModel(session_options, run_options, export_dir, {"missing-tag"}, &bundle); EXPECT_FALSE(st.ok()); - EXPECT_TRUE(str_util::StrContains( + EXPECT_TRUE(absl::StrContains( st.error_message(), "Could not find meta graph def matching supplied tags: { missing-tag }")) << st.error_message(); @@ -152,7 +152,7 @@ TEST_F(LoaderTest, NoTagMatchMultiple) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe, "missing-tag"}, &bundle); EXPECT_FALSE(st.ok()); - EXPECT_TRUE(str_util::StrContains( + EXPECT_TRUE(absl::StrContains( st.error_message(), "Could not find meta graph def matching supplied tags: ")) << st.error_message(); @@ -172,7 +172,7 @@ TEST_F(LoaderTest, SessionCreationFailure) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); - EXPECT_TRUE(str_util::StrContains(st.error_message(), kInvalidTarget)) + EXPECT_TRUE(absl::StrContains(st.error_message(), kInvalidTarget)) << st.error_message(); } diff --git a/tensorflow/cc/saved_model/reader.cc b/tensorflow/cc/saved_model/reader.cc index 2146c8a1974..799856f7fd4 100644 --- a/tensorflow/cc/saved_model/reader.cc +++ b/tensorflow/cc/saved_model/reader.cc @@ -51,7 +51,7 @@ Status ReadSavedModel(const string& export_dir, SavedModel* saved_model_proto) { Status FindMetaGraphDef(const SavedModel& saved_model_proto, const std::unordered_set& tags, MetaGraphDef* meta_graph_def) { - LOG(INFO) << "Reading meta graph with tags { " << str_util::Join(tags, " ") + LOG(INFO) << "Reading meta graph with tags { " << absl::StrJoin(tags, " ") << " }"; for (const MetaGraphDef& graph_def : saved_model_proto.meta_graphs()) { // Get tags from the graph_def. @@ -69,7 +69,7 @@ Status FindMetaGraphDef(const SavedModel& saved_model_proto, error::Code::NOT_FOUND, strings::StrCat( "Could not find meta graph def matching supplied tags: { ", - str_util::Join(tags, " "), + absl::StrJoin(tags, " "), " }. 
To inspect available tag-sets in the SavedModel, please " "use the SavedModel CLI: `saved_model_cli`")); } diff --git a/tensorflow/cc/saved_model/reader_test.cc b/tensorflow/cc/saved_model/reader_test.cc index 620e9c2eece..e898664c221 100644 --- a/tensorflow/cc/saved_model/reader_test.cc +++ b/tensorflow/cc/saved_model/reader_test.cc @@ -64,7 +64,7 @@ TEST_F(ReaderTest, NoTagMatch) { Status st = ReadMetaGraphDefFromSavedModel(export_dir, {"missing-tag"}, &meta_graph_def); EXPECT_FALSE(st.ok()); - EXPECT_TRUE(str_util::StrContains( + EXPECT_TRUE(absl::StrContains( st.error_message(), "Could not find meta graph def matching supplied tags: { missing-tag }")) << st.error_message(); @@ -78,7 +78,7 @@ TEST_F(ReaderTest, NoTagMatchMultiple) { Status st = ReadMetaGraphDefFromSavedModel( export_dir, {kSavedModelTagServe, "missing-tag"}, &meta_graph_def); EXPECT_FALSE(st.ok()); - EXPECT_TRUE(str_util::StrContains( + EXPECT_TRUE(absl::StrContains( st.error_message(), "Could not find meta graph def matching supplied tags: ")) << st.error_message(); diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD index c173569a095..8e509aeeae8 100644 --- a/tensorflow/cc/tools/BUILD +++ b/tensorflow/cc/tools/BUILD @@ -3,10 +3,9 @@ package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) load( diff --git a/tensorflow/cc/tutorials/example_trainer.cc b/tensorflow/cc/tutorials/example_trainer.cc index 5dbc4f5f6aa..789662f84d0 100644 --- a/tensorflow/cc/tutorials/example_trainer.cc +++ b/tensorflow/cc/tutorials/example_trainer.cc @@ -167,8 +167,7 @@ namespace { bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag, int32* dst) { - if (tensorflow::str_util::ConsumePrefix(&arg, flag) && - tensorflow::str_util::ConsumePrefix(&arg, "=")) { + if (absl::ConsumePrefix(&arg, flag) && absl::ConsumePrefix(&arg, "=")) { char extra; return (sscanf(arg.data(), "%d%c", dst, &extra) == 1); } @@ -178,7 +177,7 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag, bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag, bool* dst) { - if (tensorflow::str_util::ConsumePrefix(&arg, flag)) { + if (absl::ConsumePrefix(&arg, flag)) { if (arg.empty()) { *dst = true; return true; diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index 6362470abef..6daf18b51c4 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -1,7 +1,6 @@ -licenses(["notice"]) # Apache 2.0 - package( default_visibility = ["//visibility:private"], + licenses = ["notice"], # Apache 2.0 ) load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 4b3726b8475..7d5e889bf7d 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -1,4 +1,12 @@ -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + ":internal", + # BEGIN-GOOGLE-INTERNAL + "//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__", + # END-GOOGLE-INTERNAL + ], + licenses = ["notice"], # Apache 2.0 +) package_group( name = "internal", @@ -14,15 +22,6 @@ package_group( ], ) -package( - default_visibility = [ - ":internal", - # BEGIN-GOOGLE-INTERNAL - "//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__", - # END-GOOGLE-INTERNAL - ], -) - load("//tensorflow:tensorflow.bzl", "tf_cc_test", 
"cc_header_only_library") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") @@ -200,6 +199,7 @@ cc_library( "//tensorflow/core/kernels:host_constant_op", "//tensorflow/core/kernels:identity_n_op", "//tensorflow/core/kernels:identity_op", + "//tensorflow/core/kernels:logging_ops", "//tensorflow/core/kernels:no_op", "//tensorflow/core/kernels:queue_op", "//tensorflow/core/kernels:resource_variable_ops", @@ -257,10 +257,8 @@ cc_library( name = "xla_launch_util", srcs = ["xla_launch_util.cc"], hdrs = ["xla_launch_util.h"], - # TODO(skyewm): remove this once XlaAllocator is factored out. visibility = [ ":internal", - "//tensorflow/compiler/xla/python:__pkg__", ], deps = [ ":common", diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 91e85970cc0..2d12de53b45 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -244,11 +244,11 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( "resource variable op in called function"); } - if (!op_filter_.allow_slow_and_inaccurate_ops && OpIsInaccurate(node)) { + if (!op_filter_.allow_inaccurate_ops && OpIsInaccurate(node)) { return LogNotCompilableAndReturn(node, "operation with correctness issues"); } - if (!op_filter_.allow_slow_and_inaccurate_ops && OpIsSlow(node)) { + if (!op_filter_.allow_slow_ops && OpIsSlow(node)) { return LogNotCompilableAndReturn(node, "slow operation"); } @@ -268,8 +268,8 @@ RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter( registration.elide_assert_and_checknumerics; op_filter.allow_ops_producing_or_consuming_variant = registration.cluster_variant_ops; - op_filter.allow_slow_and_inaccurate_ops = - registration.cluster_slow_and_inaccurate_ops; + op_filter.allow_slow_ops = registration.cluster_slow_ops; + op_filter.allow_inaccurate_ops = registration.cluster_inaccurate_ops; return op_filter; } diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 4be8050f7da..a20fc976289 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -97,9 +97,12 @@ class RecursiveCompilabilityChecker { // live-out DT_VARIANT values. bool allow_ops_producing_or_consuming_variant; - // Whether ops known to be slow or to have correctness issues should be - // auto-clustered. - bool allow_slow_and_inaccurate_ops; + // Whether ops known to be slow on XLA-GPU should be considered compilable.. + bool allow_slow_ops; + + // Whether ops known to have numerical accuracy issues should be considered + // compilable.. + bool allow_inaccurate_ops; }; RecursiveCompilabilityChecker(const OperationFilter* op_filter, diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc index 0a92c06ad10..1f23c0880db 100644 --- a/tensorflow/compiler/jit/deadness_analysis.cc +++ b/tensorflow/compiler/jit/deadness_analysis.cc @@ -14,10 +14,12 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/jit/deadness_analysis.h" + #include "absl/algorithm/container.h" #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" #include "tensorflow/compiler/jit/deadness_analysis_internal.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -43,12 +45,12 @@ limitations under the License. // ------------------------------------------ // // If we ignore cycles for a moment, computing predicates is fairly -// straightforward. We traverse the graph in RPO, mapping each node to a -// predicate based on the predicates its inputs are mapped to. For instance a -// Merge(X, Y) node will be mapped to OR(PredicateFor(X), PredicateFor(Y)). -// Roughtly speaking, we abstract interpret each node on the "liveness" domain, -// where values in the domain represent if a tensor carries a dead signal or -// not. +// straightforward. We traverse the graph in a topological order, mapping each +// node to a predicate based on the predicates its inputs are mapped to. For +// instance a Merge(X, Y) node will be mapped to OR(PredicateFor(X), +// PredicateFor(Y)). Roughtly speaking, we abstractly interpret each node on +// the "liveness" domain, where values in the domain represent if a tensor +// carries a dead signal or not. // // // DEALING WITH CYCLES @@ -85,22 +87,28 @@ limitations under the License. // true on iteration 0, 1, 2 respectively. This is made more precise in the // comment on the AndRecurrence class. // -// The general algorithm that deals with cycles does two RPO (reverse post -// order) passes over the graph. On the first pass it assigns a symbolic -// predicate to merge nodes with backedges. On the second pass it tries to -// pattern matche the predicates for the backedges of these merges and infer an -// AndRecurrence for the merge. +// The general algorithm that deals with cycles does two topological-order +// iterations over the graph. On the first iteration it assigns a symbolic +// predicate to merge nodes with backedges. On the second iteration it tries +// to pattern match the predicates for the backedges of these merges and infer +// an AndRecurrence for the merge. In other words, we do a data flow analysis +// where the data-flow lattice has two elements, Symbolic and NonSymbolic with +// Symbolic > NonSymbolic. The lattice has height = 2 so two iterations are +// sufficient to converge. // -// In other words, we do a pessimistic data flow analysis where the data-flow -// lattice has two elements, Symbolic and NonSymbolic with Symbolic > -// NonSymbolic. The lattice has height = 2 so two iterations are sufficient to -// converge. We don't do an optimistic data flow analysis to make pattern -// matching easier: if we assigned the predicate of the initial value to the -// merge during the first pass, on the second pass the backedge may see a -// simplified value that would be difficult to pattern match. +// We first do an optimisitc analysis and, if it does not converge, we then fall +// back to a pessimistic analysis. The optimistic analysis assigns the same +// symbolic predicate to all the merge nodes whose preceding enter nodes have +// the same frame name on the first iteration. 
On the second iteration, if all +// the merge nodes are pattern matched into the same AndRecurrence predicate +// instance, the optimistic assignment of the same symbolic predicate is correct +// and the analyzed result is taken. // -// We still use symbolic predicates for merges for which we can't pattern match -// on the backedge predicate. This is conservatively correct. +// Otherwise, if the optimistic analysis fails to converge, we then obtain the +// result by falling back to the pessimistic analysis which assigns a unique +// symbolic predicate to each merge on the first iteration. We still use +// symbolic predicates for merges for which we can't pattern match on the +// backedge predicate. This is conservatively correct. namespace tensorflow { @@ -636,6 +644,35 @@ Predicate* PredicateFactory::MakeAndOrImpl( negated_ops.insert(negated_op); } + // Simplify {S,&,X} & ~X & ... => S & ... + if (is_and) { + absl::flat_hash_set to_remove; + std::vector to_add; + for (Predicate* op : simplified_ops) { + if (op->kind() == Predicate::Kind::kAndRecurrence) { + auto* and_rec = static_cast(op); + if (negated_ops.contains(and_rec->step())) { + // Remove and_rec and ~X and insert S. Note that checking the + // existence of ~X through negated_ops is sufficient since it makes + // sure the predicate is in the input operands. It does not need to + // be in simplified_ops if it was already cancelled out. + to_remove.insert(and_rec); + to_remove.insert(MakeNotPredicate(and_rec->step())); + to_add.push_back(and_rec->start()); + } + } + } + auto it = simplified_ops.begin(); + while (it != simplified_ops.end()) { + if (to_remove.contains(*it)) { + it = simplified_ops.erase(it); + } else { + ++it; + } + } + simplified_ops.insert(simplified_ops.end(), to_add.begin(), to_add.end()); + } + // If all ops contain the same subop, then factor it out thanks to the // distributive property. 
Such as: // - (A & B) | (A & C) | (A & D) => A & (B | C | D) @@ -699,8 +736,9 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis { explicit DeadnessAnalysisImpl(const Graph* graph) : graph_(*graph), vlog_(VLOG_IS_ON(2)) {} - Status Populate(); - Status PopulateWithReversePostOrder(absl::Span rpo); + Status Populate(bool enable_optimistic); + Status PopulateFrame(absl::Span topo, bool use_optimistic_mode, + bool* success); StatusOr GetPredicateFor( Node* n, int oidx) const override; void Print() const override; @@ -742,16 +780,29 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis { } Status HandleSwitch(Node* n, std::vector* should_revisit); - Status HandleMerge(Node* n, std::vector* should_revisit); + Status HandleMerge(Node* n, std::vector* should_revisit, + bool use_optimistic_mode); Status HandleRecv(Node* n, std::vector* should_revisit); Status HandleGeneric(Node* n, std::vector* should_revisit); - Status HandleNode(Node* n, std::vector* should_revisit); + Status HandleNode(Node* n, std::vector* should_revisit, + bool use_optimistic_mode = false); + + Status GetFrameBasedTopologicalOrder(std::vector* order); + + bool IsRootEnter(const Node* n) const { + return IsEnter(n) && control_flow_info_[n->id()].parent_frame->IsSource(); + } + + bool IsRootExit(const Node* n) const { + return IsExit(n) && control_flow_info_[n->id()].parent_frame->IsSource(); + } const Graph& graph_; absl::flat_hash_map predicate_map_; PredicateFactory predicate_factory_; std::vector control_flow_info_; bool vlog_; + absl::flat_hash_map frame_to_merge_node_; }; TensorId InputEdgeToTensorId(const Edge* e) { @@ -914,10 +965,32 @@ Status GetFullFrame(const Node* n, absl::Span cfi_infos, return Status::OK(); } + +// If the node is inside some frames, get the name of the outermost non-empty +// frame. Otherwise, get an empty frame name. +Status GetRootFrame(const Node* n, absl::Span cfi_infos, + absl::string_view* frame) { + int depth = 0; + const ControlFlowInfo* cfi_iter = &cfi_infos[n->id()]; + while (!cfi_iter->parent_frame->IsSource()) { + n = cfi_iter->parent_frame; + cfi_iter = &cfi_infos[n->id()]; + + if (depth++ > 5000) { + return errors::Internal( + "Frame of depth > 5000: Probably malformed graph or a bug in " + "BuildControlFlowInfo"); + } + } + + *frame = cfi_iter->frame_name; + return Status::OK(); +} } // namespace Status DeadnessAnalysisImpl::HandleMerge(Node* n, - std::vector* should_revisit) { + std::vector* should_revisit, + bool use_optimistic_mode) { // Merge ignores deadness of its control inputs. A merge that isn't the // target of a backedge has is alive iff any of its data inputs are. The // liveness of a merge that is the target of a backedge can sometimes be @@ -937,8 +1010,21 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n, // We're visiting this merge for the first time and it has an unvisited // backedge. Predicate* input_data_pred; - TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate( - n, /*output_idx=*/0, /*must_be_true=*/false, &input_data_pred)); + if (use_optimistic_mode) { + // In the optimistic mode, we use the first-seen Merge node per + // frame as the representative Merge node. It is just convenient and + // does not affect the result after pattern-matching into the + // AndRecurrence form. 
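The GetRootFrame helper added above walks the chain of parent frames until it reaches the source frame, returning the outermost non-empty frame name and guarding against pathologically deep (likely malformed) chains. A minimal standalone sketch of that walk, using a hypothetical Frame struct in place of Node/ControlFlowInfo:

#include <cstdio>
#include <string>

// Hypothetical stand-in for a node's control-flow frame; in the real code the
// chain is traversed through ControlFlowInfo::parent_frame.
struct Frame {
  std::string name;
  const Frame* parent;  // nullptr for a root-level frame.
};

// Writes the name of the outermost frame (or "" if `frame` is null, i.e. the
// node is not inside any frame). Returns false if the chain is suspiciously
// deep, which probably indicates a malformed graph.
bool GetRootFrameName(const Frame* frame, std::string* out) {
  int depth = 0;
  while (frame != nullptr && frame->parent != nullptr) {
    frame = frame->parent;
    if (++depth > 5000) return false;
  }
  *out = (frame != nullptr) ? frame->name : "";
  return true;
}

int main() {
  Frame outer{"outer_loop", nullptr};
  Frame inner{"inner_loop", &outer};
  std::string root;
  if (GetRootFrameName(&inner, &root)) {
    std::printf("root frame: %s\n", root.c_str());  // Prints "outer_loop".
  }
  return 0;
}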
+ absl::string_view frame_name = control_flow_info_[n->id()].frame_name; + auto insert_result = frame_to_merge_node_.insert({frame_name, n}); + Node* representative = insert_result.first->second; + TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate( + representative, /*output_idx=*/0, /*must_be_true=*/false, + &input_data_pred)); + } else { + TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate( + n, /*output_idx=*/0, /*must_be_true=*/false, &input_data_pred)); + } SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred, should_revisit); @@ -948,7 +1034,7 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n, std::vector input_preds; TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataOnly, &input_preds)); - // We're visiting this merge for the first time and it is a acyclic merge. + // We're visiting this merge for the first time and it is an acyclic merge. Predicate* input_data_pred = predicate_factory_.MakeOrPredicate(input_preds); SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred, @@ -1022,11 +1108,12 @@ Status DeadnessAnalysisImpl::HandleGeneric(Node* n, } Status DeadnessAnalysisImpl::HandleNode(Node* n, - std::vector* should_revisit) { + std::vector* should_revisit, + bool use_optimistic_mode) { if (n->IsSwitch()) { TF_RETURN_IF_ERROR(HandleSwitch(n, should_revisit)); } else if (n->IsMerge()) { - TF_RETURN_IF_ERROR(HandleMerge(n, should_revisit)); + TF_RETURN_IF_ERROR(HandleMerge(n, should_revisit, use_optimistic_mode)); } else if (n->IsControlTrigger()) { SetPredicate(n, Graph::kControlSlot, predicate_factory_.MakeTrue(), nullptr); @@ -1040,17 +1127,129 @@ Status DeadnessAnalysisImpl::HandleNode(Node* n, return Status::OK(); } -Status DeadnessAnalysisImpl::Populate() { - std::vector rpo; - GetReversePostOrder(graph_, &rpo, /*stable_comparator=*/NodeComparatorName(), - /*edge_filter=*/[](const Edge& edge) { - return !edge.src()->IsNextIteration(); - }); - return PopulateWithReversePostOrder(rpo); +// Compute a special topological order for the Graph, where nodes having the +// same root frame are placed adjacent to each other. The traversal uses a +// variant of Kahn's algorithm. num_ready_inputs is used to keep track of how +// many inputs of each node are ready; a node is ready to be scheduled if all +// of its inputs are ready. +// Ref. to https://en.wikipedia.org/wiki/Topological_sorting for details. +Status DeadnessAnalysisImpl::GetFrameBasedTopologicalOrder( + std::vector* order) { + absl::flat_hash_map num_enters_for_frame; + absl::flat_hash_map num_exits_for_frame; + std::vector num_ready_inputs(graph_.num_node_ids(), 0); + Node* src_node = graph_.source_node(); + for (const auto* node : graph_.op_nodes()) { + const ControlFlowInfo& cf = control_flow_info_[node->id()]; + if (IsRootEnter(node)) { + // Since we care only the root-level frame, full frame names are the same + // as frame names. + ++num_enters_for_frame[cf.frame_name]; + } else if (IsRootExit(node)) { + ++num_exits_for_frame[cf.frame_name]; + } + // Edge NextIteration->Merge is counted before starting the traveral to + // break the backedges. + if (IsMerge(node)) { + for (const Edge* e : node->in_edges()) { + if (IsNextIteration(e->src())) { + ++num_ready_inputs[node->id()]; + } + } + } + } + + // dequeue is used to ensure that the nodes are first-in-first-out. This + // order guarantees that the exits in the ready queue are visited before + // nodes that will become ready in the future. 
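The traversal below is a frame-aware variant of Kahn's algorithm. For reference, a minimal standalone version of the underlying worklist pattern (counting ready inputs and draining a FIFO queue), over a toy adjacency-list graph rather than the TensorFlow Graph type, might look like this:

#include <cstdio>
#include <deque>
#include <vector>

// Plain Kahn's algorithm. Node i becomes ready once all of its inputs have
// been visited; ready nodes are drained first-in-first-out.
std::vector<int> TopologicalOrder(
    const std::vector<std::vector<int>>& out_edges,
    const std::vector<int>& num_inputs) {
  const int n = static_cast<int>(out_edges.size());
  std::vector<int> num_ready_inputs(n, 0);
  std::deque<int> ready;
  for (int i = 0; i < n; ++i) {
    if (num_inputs[i] == 0) ready.push_back(i);
  }
  std::vector<int> order;
  while (!ready.empty()) {
    int curr = ready.front();
    ready.pop_front();
    order.push_back(curr);
    for (int out : out_edges[curr]) {
      if (++num_ready_inputs[out] == num_inputs[out]) ready.push_back(out);
    }
  }
  return order;  // Shorter than n if the graph has a cycle.
}

int main() {
  // 0 -> 1 -> 3 and 0 -> 2 -> 3.
  std::vector<std::vector<int>> out_edges = {{1, 2}, {3}, {3}, {}};
  std::vector<int> num_inputs = {0, 1, 1, 2};
  for (int id : TopologicalOrder(out_edges, num_inputs)) std::printf("%d ", id);
  std::printf("\n");  // Prints "0 1 2 3".
  return 0;
}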
+ std::deque ready; + ready.push_back(src_node); + // ready_enters_per_frame and ready_exits serve as a staging area to buffer + // the ready enters/exits before they are moved to the `ready` queue for + // controlling the start and end of a processing frame. + absl::flat_hash_map> + ready_enters_per_frame; + // Exit nodes shall all be from the same frame, as we process a frame at a + // time. So, one vector is enough. + std::vector ready_exits; + while (!ready.empty()) { + Node* curr_node = ready.front(); + ready.pop_front(); + + VLOG(4) << "Visiting " << curr_node->name(); + order->push_back(curr_node); + + for (const Edge* out_edge : curr_node->out_edges()) { + Node* out = out_edge->dst(); + int out_id = out->id(); + if (IsNextIteration(curr_node) && IsMerge(out)) { + // Edge NextIteration->Merge has been counted. + continue; + } + ++num_ready_inputs[out->id()]; + if (!out->IsOp()) continue; // Skip Sink/Source nodes. + if (num_ready_inputs[out->id()] != out->in_edges().size()) continue; + + absl::string_view frame_name = control_flow_info_[out_id].frame_name; + if (IsRootEnter(out)) { + ready_enters_per_frame[frame_name].push_back(out); + } else if (IsRootExit(out)) { + ready_exits.push_back(out); + } else { + ready.push_back(out); + } + } + + if (ready.empty()) { + // Try moving nodes from ready_enters_per_frame and ready_exits to + // `ready`. + if (!ready_exits.empty()) { + // If there are nodes in ready_exits we must process them before + // processing ready_enters_per_frame to make sure all nodes in the + // currently processing frame are visited before starting processing + // other frames. + absl::string_view frame_name = + control_flow_info_[ready_exits.front()->id()].frame_name; + CHECK_EQ(ready_exits.size(), num_exits_for_frame[frame_name]); + ready.insert(ready.end(), ready_exits.begin(), ready_exits.end()); + ready_exits.clear(); + } else { + // Otherwise, try moving nodes from ready_enters to `ready`. + for (auto iter = ready_enters_per_frame.begin(); + iter != ready_enters_per_frame.end(); ++iter) { + absl::string_view frame_name = iter->first; + const std::vector& ready_enters = iter->second; + if (ready_enters.size() == num_enters_for_frame[frame_name]) { + ready.insert(ready.end(), ready_enters.begin(), ready_enters.end()); + ready_enters_per_frame.erase(iter); + break; + } + } + } + } + } + + if (!ready_enters_per_frame.empty() || !ready_exits.empty()) { + return errors::InvalidArgument( + "Some enters/exits have never been visited in the traversal." + " Most probably the input graph is malformed."); + } + return Status::OK(); } -Status DeadnessAnalysisImpl::PopulateWithReversePostOrder( - absl::Span rpo) { +// We populate the nodes along a special topological order where nodes having +// the same root frame are placed adjacent to each other. This grouping enables +// processing the graph per root frame at a time and guarantees that when a root +// frame is being processed, nodes in the downstream frames have not yet been +// processed. This property is important because we need to process an entire +// frame to know whether the optimistic mode converges or not. In other words, +// nodes in the downstream frames shall not be populated until all of its +// upstream frames are populated. In effect, this order enables processing each +// (nested) tf.while one-by-one, as each (nested) tf.while creates a unique +// (root) frame. 
Note that we don't separate while loops belonging to the same +// nested while, as there is no clean cut for separating them in the topological +// order. +Status DeadnessAnalysisImpl::Populate(bool enable_optimistic) { std::vector unreachable_nodes; // Compute the loop structure of the graph. TF_RETURN_IF_ERROR( @@ -1069,14 +1268,63 @@ Status DeadnessAnalysisImpl::PopulateWithReversePostOrder( absl::StrJoin(unreachable_nodes, ", ")); } + std::vector topo; + TF_RETURN_IF_ERROR(GetFrameBasedTopologicalOrder(&topo)); + + size_t frame_start = 0; + while (frame_start < topo.size()) { + // Batching nodes who have the same root frame. + absl::string_view cur_frame_name; + TF_RETURN_IF_ERROR( + GetRootFrame(topo[frame_start], control_flow_info_, &cur_frame_name)); + size_t frame_end = frame_start; + for (size_t i = frame_start + 1; i < topo.size(); ++i) { + absl::string_view i_frame_name; + TF_RETURN_IF_ERROR( + GetRootFrame(topo[i], control_flow_info_, &i_frame_name)); + if (i_frame_name == cur_frame_name) { + frame_end = i; + } else { + break; + } + } + absl::Span sub_topo(topo.data() + frame_start, + /*length=*/frame_end - frame_start + 1); + frame_start = frame_end + 1; + + // First, try the optimistic mode. + bool success = false; + if (enable_optimistic && !cur_frame_name.empty()) { + TF_RETURN_IF_ERROR( + PopulateFrame(sub_topo, /*use_optimistic_mode=*/true, &success)); + } + if (!success) { + // The optimistic mode does not converge. Let's fall back to the + // pessimistic mode. + TF_RETURN_IF_ERROR( + PopulateFrame(sub_topo, /*use_optimistic_mode=*/false, nullptr)); + } + VLOG(2) << "Done populating frame " << cur_frame_name << " using the " + << (success ? "optimistic" : "pessimistic") << " mode."; + } + + return Status::OK(); +} + +Status DeadnessAnalysisImpl::PopulateFrame(absl::Span topo, + bool use_optimistic_mode, + bool* success) { + CHECK(use_optimistic_mode && success != nullptr || + !use_optimistic_mode && success == nullptr); + // This an abstract interpretation over the deadness propagation semantics of // the graph executor. // - // We iterate over the graph twice, each time in RPO. On the first iteration - // merge nodes with backedges are mapped to symbolic predicates. On the - // second iteration we use the predicates assigned to the backedges in the - // previous iteration to infer a more precise predicate for the backedge merge - // nodes and all the nodes that transitively use it. + // We iterate over the graph twice, each time in a topological order. On the + // first iteration merge nodes with backedges are mapped to symbolic + // predicates. On the second iteration we use the predicates assigned to the + // backedges in the previous iteration to infer a more precise predicate for + // the backedge merge nodes and all the nodes that transitively use it. // // We don't track the output indices for should_revisit. Instead, putting a // node in `should_revisit` denotes that the deadness flowing out from any @@ -1086,9 +1334,10 @@ Status DeadnessAnalysisImpl::PopulateWithReversePostOrder( // delta should not change in the second iteration. 
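Populate() above slices the frame-based topological order into contiguous runs that share a root frame and processes one run (frame) at a time, trying the optimistic mode first and falling back to the pessimistic mode. A standalone sketch of just the slicing step, with strings standing in for root frame names (empty meaning "not in any frame"):

#include <cstdio>
#include <string>
#include <vector>

#include "absl/types/span.h"

int main() {
  // A frame-based topological order guarantees that nodes with the same root
  // frame are adjacent, so every frame is one contiguous run.
  std::vector<std::string> root_frame_of_node = {"", "", "while_0", "while_0",
                                                 "while_0", "while_1", "while_1"};
  size_t frame_start = 0;
  while (frame_start < root_frame_of_node.size()) {
    size_t frame_end = frame_start;
    while (frame_end + 1 < root_frame_of_node.size() &&
           root_frame_of_node[frame_end + 1] == root_frame_of_node[frame_start]) {
      ++frame_end;
    }
    absl::Span<const std::string> sub_topo(
        root_frame_of_node.data() + frame_start, frame_end - frame_start + 1);
    std::printf("frame '%s' has %zu node(s)\n",
                root_frame_of_node[frame_start].c_str(), sub_topo.size());
    // The real code would now run PopulateFrame over `sub_topo`, optimistic
    // first (for non-empty frame names) and pessimistic as a fallback.
    frame_start = frame_end + 1;
  }
  return 0;
}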
std::vector should_revisit; should_revisit.resize(graph_.num_node_ids()); - for (Node* n : rpo) { + for (Node* n : topo) { VLOG(4) << "Visiting " << n->name(); - TF_RETURN_IF_ERROR(HandleNode(n, /*should_revisit=*/nullptr)); + TF_RETURN_IF_ERROR( + HandleNode(n, /*should_revisit=*/nullptr, use_optimistic_mode)); if (n->IsNextIteration()) { // If this is a backedge for a merge node then remember to reprocess the // merge the next time we run. @@ -1100,11 +1349,11 @@ Status DeadnessAnalysisImpl::PopulateWithReversePostOrder( } } - for (Node* n : rpo) { + for (Node* n : topo) { // The nodes added to should_revisit in the previous loop need to be // revisited now. Reprocesing these initial nodes may add *their* consumers // to should_revisit, and these newly added nodes will also be processed by - // this very same loop. Since we're traversing the graph in reverse post + // this very same loop. Since we're traversing the graph in topological // order (producers before consumers) and HandleNode(n) can only ever add // n's consumers to should_revisit, we won't "miss" an addition to // should_revisit. @@ -1114,6 +1363,71 @@ Status DeadnessAnalysisImpl::PopulateWithReversePostOrder( } } + // Check if the optimistic analysis converges. Specifically, check whether + // all the predicates of the merge nodes in the same frame are the same. If + // yes, report success. If not, report failure and clear the assigned + // predicates. + if (use_optimistic_mode) { + bool is_converged = true; + absl::flat_hash_map frame_to_pred; + for (Node* n : topo) { + if (!n->IsMerge()) { + continue; + } + const Edge* e; + TF_RETURN_IF_ERROR(FindUniqueBackedge(n, &e)); + if (e == nullptr) { + // Skip acyclic merge nodes. + continue; + } + Node* merge = n; + // Note that here uses frame names instead of root frame names. In the + // case of a nested while loop, each level of while loops can have merges + // with different predicate instances, while the merge nodes on the same + // level must have the same predicate instances. + absl::string_view frame_name = control_flow_info_[merge->id()].frame_name; + auto it = predicate_map_.find(TensorId(merge->name(), 0)); + Predicate* merge_pred = it->second; + if (merge_pred->kind() != Predicate::Kind::kAndRecurrence) { + is_converged = false; + VLOG(2) << "Running the optimistic mode on frame " << frame_name + << " does not converge because node " << merge->name() + << " cannot be mapped into the AndRecurrence form."; + break; + } + + auto insert_result = frame_to_pred.insert({frame_name, merge_pred}); + if (!insert_result.second) { + // If we have already seen this frame name, verify the predicate is the + // same as the previously seen one's. + Predicate* curr_andrec = merge_pred; + Predicate* prev_andrec = insert_result.first->second; + if (curr_andrec != prev_andrec) { + is_converged = false; + VLOG(2) << "Running the optimistic mode on frame " << frame_name + << " does not converge. Seeing different Merge predicates: \n" + << curr_andrec->ToString() << " and \n" + << prev_andrec->ToString(); + break; + } + } + } + + // Clear the assigned predicates if the optimistic mode does not converge. 
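The convergence check below declares the optimistic pass successful only if every cyclic merge within a frame ended up mapped to the same AndRecurrence predicate instance. A standalone sketch of that per-frame consistency check, with strings standing in for interned Predicate pointers:

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"

// Returns true if, within every frame, all merges were assigned the same
// predicate. (The real check additionally requires each predicate to be an
// AndRecurrence before comparing.)
bool OptimisticModeConverged(
    const std::vector<std::pair<std::string, std::string>>& frame_and_pred) {
  absl::flat_hash_map<std::string, std::string> frame_to_pred;
  for (const auto& entry : frame_and_pred) {
    auto insert_result = frame_to_pred.insert(entry);
    // insert() is a no-op when the frame was already seen; in that case
    // compare against the previously recorded predicate.
    if (!insert_result.second && insert_result.first->second != entry.second) {
      return false;
    }
  }
  return true;
}

int main() {
  std::printf("%d\n", OptimisticModeConverged(
                          {{"loop", "{#true,&,*iv0/cond:0}"},
                           {"loop", "{#true,&,*iv0/cond:0}"}}));  // Prints 1.
  std::printf("%d\n", OptimisticModeConverged(
                          {{"loop", "{#true,&,*iv0/cond:0}"},
                           {"loop", "div0/iv:0"}}));              // Prints 0.
  return 0;
}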
+ if (!is_converged) { + for (Node* n : topo) { + for (int oid = 0; oid < n->num_outputs(); ++oid) { + predicate_map_.erase(TensorId(n->name(), oid)); + } + predicate_map_.erase(TensorId(n->name(), Graph::kControlSlot)); + } + } + + if (success != nullptr) { + *success = is_converged; + } + } + return Status::OK(); } @@ -1149,7 +1463,7 @@ DeadnessAnalysis::~DeadnessAnalysis() {} const Graph& graph, std::unique_ptr* result) { std::unique_ptr analysis( new DeadnessAnalysisImpl(&graph)); - TF_RETURN_IF_ERROR(analysis->Populate()); + TF_RETURN_IF_ERROR(analysis->Populate(/*enable_optimistic=*/true)); if (VLOG_IS_ON(2)) { analysis->Print(); @@ -1170,22 +1484,18 @@ DeadnessAnalysisImpl::PredicateMapAsString() const { } namespace deadness_analysis_internal { -Status ComputePredicates(const Graph& graph, - PredicateMapTy* out_predicate_map) { +Status ComputePredicates(const Graph& graph, PredicateMapTy* out_predicate_map, + bool enable_optimistic) { DeadnessAnalysisImpl impl(&graph); - TF_RETURN_IF_ERROR(impl.Populate()); + TF_RETURN_IF_ERROR(impl.Populate(enable_optimistic)); *out_predicate_map = impl.PredicateMapAsString(); return Status::OK(); } -Status ComputePredicates(const Graph& graph, - absl::Span reverse_post_order, - PredicateMapTy* out_predicate_map) { - DeadnessAnalysisImpl impl(&graph); - TF_RETURN_IF_ERROR(impl.PopulateWithReversePostOrder(reverse_post_order)); - *out_predicate_map = impl.PredicateMapAsString(); - return Status::OK(); -} } // namespace deadness_analysis_internal +string DeadnessAnalysis::DebugString(DeadnessPredicate predicate) const { + return static_cast(predicate.pred_)->ToString(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/deadness_analysis.h b/tensorflow/compiler/jit/deadness_analysis.h index 08d8ad011bc..c8527de503d 100644 --- a/tensorflow/compiler/jit/deadness_analysis.h +++ b/tensorflow/compiler/jit/deadness_analysis.h @@ -82,6 +82,8 @@ class DeadnessAnalysis { virtual void Print() const = 0; virtual ~DeadnessAnalysis(); + string DebugString(DeadnessPredicate predicate) const; + // Run the deadness analysis over `graph` and returns an error or a populated // instance of DeadnessAnalysis in `result`. static Status Run(const Graph& graph, diff --git a/tensorflow/compiler/jit/deadness_analysis_internal.h b/tensorflow/compiler/jit/deadness_analysis_internal.h index 354782374ad..b2f0e72bc14 100644 --- a/tensorflow/compiler/jit/deadness_analysis_internal.h +++ b/tensorflow/compiler/jit/deadness_analysis_internal.h @@ -25,15 +25,9 @@ namespace deadness_analysis_internal { // Returns a map describing the predicate each Tensor was mapped to. For // testing purposes only. using PredicateMapTy = absl::flat_hash_map; -Status ComputePredicates(const Graph& graph, PredicateMapTy* out_predicate_map); +Status ComputePredicates(const Graph& graph, PredicateMapTy* out_predicate_map, + bool enable_optimistic = true); -// Returns a map describing the predicate each Tensor was mapped to. For -// testing purposes only. Makes deadness analysis visit the graph in the order -// specified in `reverse_post_order` which must be a valid RPO for the graph -// minus NextIteration->Merge edges. 
-Status ComputePredicates(const Graph& graph, - absl::Span reverse_post_order, - PredicateMapTy* out_predicate_map); } // namespace deadness_analysis_internal } // namespace tensorflow diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc index 3a44eb7db75..fae1e55c6ba 100644 --- a/tensorflow/compiler/jit/deadness_analysis_test.cc +++ b/tensorflow/compiler/jit/deadness_analysis_test.cc @@ -638,7 +638,22 @@ TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) { } { PredicateMapTy predicate_map; - TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map)); + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map, + /*enable_optimistic=*/true)); + + EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)], + "{#true,&,*iv0/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv0)], + predicate_map[ControlOutputFor(iv.induction_var)]); + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv1)], + predicate_map[ControlOutputFor(iv.induction_var)]); + EXPECT_EQ(predicate_map[ControlOutputFor(add0)], + predicate_map[ControlOutputFor(iv.induction_var)]); + } + { + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map, + /*enable_optimistic=*/false)); EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)], "{#true,&,*iv0/cond:0}"); @@ -660,16 +675,6 @@ TEST(DeadnessAnalysisTest, LoopInvariantPredicateOnBackedge) { CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0); FixupSourceAndSinkEdges(root.graph()); - // To make deadness analysis think that dependent_iv is a loop we need an RPO - // that visits the merge before the backedge. This is a legal RPO for - // deadness analysis since it ignores NextIteration->Merge edges during RPO. - // Right now dependent_iv has an edge from Merge to NextIteration so do the - // RPO with this edge in place. Then remove this edge to get our test case. - std::vector rpo; - GetReversePostOrder(*root.graph(), &rpo, /*stable_comparator=*/{}, - /*edge_filter=*/[](const Edge& edge) { - return !edge.src()->IsNextIteration(); - }); TF_ASSERT_OK(root.graph()->UpdateEdge( iv.induction_var.node(), 0, dependent_iv.latch.output_true.node(), 0)); @@ -677,7 +682,16 @@ TEST(DeadnessAnalysisTest, LoopInvariantPredicateOnBackedge) { { PredicateMapTy predicate_map; - TF_ASSERT_OK(ComputePredicates(*root.graph(), rpo, &predicate_map)); + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map, + /*enable_optimistic=*/true)); + + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv.induction_var)], + "{#true,&,*iv0/cond:0}"); + } + { + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map, + /*enable_optimistic=*/false)); EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv.induction_var)], "div0/iv:0"); @@ -731,7 +745,34 @@ TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) { } { PredicateMapTy predicate_map; - TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map)); + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map, + /*enable_optimistic=*/true)); + + EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer.induction_var)], + "{#true,&,*iv_outer/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner.induction_var)], + "{(*iv_outer/cond:0 & " + "{#true,&,*iv_outer/cond:0}),&,*iv_inner/" + "cond:0}"); + + // enable_optimistic = true or not should produce the same results because + // of fallback. 
However, note that the order of iv_inner/cond:0 and + // iv_inner/iv:0 is different because the optimistic approach does not + // create predicates for all merges and it can change the predicate id and + // hence the symbol order. + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv0)], + "{{#true,&,(iv_outer/iv:0 & " + "*iv_outer/cond:0)},&,(*iv_inner/cond:0 & " + "iv_inner/iv:0)}"); + EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv1)], + predicate_map[ControlOutputFor(dependent_inner_iv0)]); + EXPECT_EQ(predicate_map[ControlOutputFor(add0)], + predicate_map[ControlOutputFor(dependent_inner_iv0)]); + } + { + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map, + /*enable_optimistic=*/false)); EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer.induction_var)], "{#true,&,*iv_outer/cond:0}"); @@ -744,15 +785,10 @@ TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) { "{{#true,&,(iv_outer/iv:0 & " "*iv_outer/cond:0)},&,(iv_inner/iv:0 & " "*iv_inner/cond:0)}"); - EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv1)], - "{{#true,&,(iv_outer/iv:0 & " - "*iv_outer/cond:0)},&,(iv_inner/iv:0 & " - "*iv_inner/cond:0)}"); + predicate_map[ControlOutputFor(dependent_inner_iv0)]); EXPECT_EQ(predicate_map[ControlOutputFor(add0)], - "{{#true,&,(iv_outer/iv:0 & " - "*iv_outer/cond:0)},&,(iv_inner/iv:0 & " - "*iv_inner/cond:0)}"); + predicate_map[ControlOutputFor(dependent_inner_iv0)]); } } @@ -817,6 +853,104 @@ TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) { } } +TEST(DeadnessAnalysisTest, NestedLoopBodiesWithACapture) { + Scope root = Scope::NewRootScope().ExitOnError(); + InductionVarInfo iv_outer = + CreateInductionVariable(root, "iv_outer", "outer_loop", 0); + Output enter_constant_outer_loop = ops::internal::Enter( + root.WithOpName("constant_enter_outer_loop"), + ops::Const(root.WithOpName("constant"), 5), "outer_loop", + ops::internal::Enter::Attrs().IsConstant(true)); + ops::Switch inner_value(root.WithOpName("outer_is_live"), + enter_constant_outer_loop, iv_outer.loop_cond); + InductionVarInfo iv_inner = CreateInductionVariable( + root, "iv_inner", "inner_loop", inner_value.output_true); + + DependentInductionVar div0_outer = CreateDependentLoopInvariantValue( + root, "div0_outer", "outer_loop", iv_outer.loop_cond, 0); + DependentInductionVar div1_outer = CreateDependentLoopInvariantValue( + root, "div1_outer", "outer_loop", iv_outer.loop_cond, 0); + + DependentInductionVar div0_inner = CreateDependentLoopInvariantValue( + root, "div0_inner", "inner_loop", iv_inner.loop_cond, + div0_outer.induction_var); + DependentInductionVar div1_inner = CreateDependentLoopInvariantValue( + root, "div1_inner", "inner_loop", iv_inner.loop_cond, + div1_outer.induction_var); + + Output captured = ops::_Recv(root.WithOpName("captured"), DT_INT32, + "tensor_a", "sender", 0, "receiver"); + Output capture_enter_outer = ops::internal::Enter( + root.WithOpName("capture_enter_outer"), captured, "outer_loop", + ops::internal::Enter::Attrs().IsConstant(true)); + Output capture_enter_inner = ops::internal::Enter( + root.WithOpName("capture_enter_inner"), capture_enter_outer, "inner_loop", + ops::internal::Enter::Attrs().IsConstant(true)); + Output mul0 = ops::Mul(root.WithOpName("mul0"), div1_inner.induction_var, + capture_enter_inner); + TF_ASSERT_OK(root.graph()->UpdateEdge( + mul0.node(), 0, div1_inner.latch.output_true.node(), 0)); + + Output add0 = ops::Add(root.WithOpName("add0"), div0_inner.induction_var, + 
div1_inner.induction_var); + + VLogGraphIfAsked(*root.graph()); + + { + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + TF_ASSERT_OK_AND_ASSIGN( + bool has_inputs_with_mismatching_deadness, + HasInputsWithMismatchingDeadness(*result, *add0.node())); + EXPECT_TRUE(has_inputs_with_mismatching_deadness); + } +} + +TEST(DeadnessAnalysisTest, CyclicRecurrence) { + Scope root = Scope::NewRootScope().ExitOnError(); + InductionVarInfo iv = CreateInductionVariable(root, "iv0", "loop", 0); + DependentInductionVar div0 = + CreateDependentLoopInvariantValue(root, "div0", "loop", iv.loop_cond, 0); + DependentInductionVar div1 = + CreateDependentLoopInvariantValue(root, "div1", "loop", iv.loop_cond, 0); + FixupSourceAndSinkEdges(root.graph()); + TF_ASSERT_OK(root.graph()->UpdateEdge(div1.induction_var.node(), 0, + div0.latch.output_true.node(), 0)); + TF_ASSERT_OK(root.graph()->UpdateEdge(div0.induction_var.node(), 0, + div1.latch.output_true.node(), 0)); + + VLogGraphIfAsked(*root.graph()); + + { + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map, + /*enable_optimistic=*/true)); + + EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)], + "{#true,&,*iv0/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(div0.induction_var)], + "{#true,&,*iv0/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(div1.induction_var)], + "{#true,&,*iv0/cond:0}"); + + // This tests the rule {S,&,X} & ~X => S. + TensorId switch_false_out = {div1.latch.output_false.node()->name(), + div1.latch.output_false.index()}; + EXPECT_EQ(predicate_map[switch_false_out], "(#true)"); + } + { + PredicateMapTy predicate_map; + TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map, + /*enable_optimistic=*/false)); + + EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)], + "{#true,&,*iv0/cond:0}"); + EXPECT_EQ(predicate_map[ControlOutputFor(div0.induction_var)], "div0/iv:0"); + EXPECT_EQ(predicate_map[ControlOutputFor(div1.induction_var)], "div1/iv:0"); + } +} + TEST(DeadnessAnalysisTest, AndRecurrenceNeedsFrameName) { Scope root = Scope::NewRootScope().ExitOnError(); InductionVarInfo iv_0 = CreateInductionVariable(root, "iv_0", "frame_0", 10); diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index b6d97434eb0..982803d501f 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -144,17 +144,8 @@ static const char* const kRecvAtHostOp = "_XlaRecvAtHost"; class Encapsulator { public: - Encapsulator(string group_attribute, string outside_compilation_attribute, - Graph const* graph_in) - : group_attribute_(std::move(group_attribute)), - outside_compilation_attribute_( - std::move(outside_compilation_attribute)), - graph_in_(graph_in) {} - - // Find dependencies between subgraphs and outside_compilation clusters that - // only manifest via edges between outside_compilation clusters in the outer - // (non-compiled) graph. - Status FindClusterDependencies(); + Encapsulator(string group_attribute, Graph const* graph_in) + : group_attribute_(std::move(group_attribute)), graph_in_(graph_in) {} // Find subgraphs marked with 'group_attribute', and build a new // subgraph, one for each value of 'group_attribute'. @@ -176,68 +167,22 @@ class Encapsulator { private: // A subgraph of the input, all marked with a common 'group_attribute' - // value. 
A subgraph may contain multiple `outside_compilation' clusters. + // value. // // In the following simple example, A, B, ..., E are nodes in the original - // graph. The group attributes and outside_compilation attributes g and oc are - // each shown as either 0 or empty. + // graph. The group attributes g are each shown as either 0 or empty. // // A --> B --> C --> D --> E // g: g:0 g:0 g:0 g: - // oc: oc: oc:0 oc: oc: // // The example is rewritten to two graphs; one on the host and one to be - // compiled. The host graph is as follows. RAH is a RecvAtHost node receiving - // input from the compiled cluster, and SFH is a SendFromHost node sending - // input back to the compiled cluster. Dotted edges are control edges. A - // 'sequencing' node S is inserted, and both RAH and SFH are connected via S - // to E (and in general all nodes that depend on nodes in the compiled - // cluster) to ensure that they are not pruned. + // compiled. The host graph is as follows. // // A --> Call --> E - // ^ - // . - // ........> S - // .... ^ - // .. . - // RAH --> C --> SFH // - // The compiled cluster is as follows. HC is a HostCompute node which is the - // source of a channel to the RAH node above and the destination of a channel - // from the SFH node above. + // The compiled cluster is as follows. // - // Arg --> B --> HC --> D --> Retval - // - // The channels HC/RAH and SFH/HC each transmit multiple tensors, so there is - // at most one RAH and SFH in each outside_compilation cluster. This design is - // preferred over adding separate Arg/Retval nodes for each transmitted value - // because it allows optimizations to the host code that would like to limit - // communication between host and device and, e.g., raise only one interrupt - // per channel rather than one per transmitted value. - // - // The shapes of the outputs from the HC node in general cannot be determined - // until the shapes of its inputs are known at compile time, since e.g., - // above, the shape of C's outputs aren't known until the shape of its inputs - // are known. If the shapes of the HC's outputs can be determined during the - // rewrite, they are stored in the node's 'shapes' attr. Otherwise a minimal - // graph is stored in the shape_inference_graph attr. This graph can be used - // when compiling the HC Op to determined the shape of the SFH inputs given - // the shapes of any ancestor RAH outputs. If it can be determined that the - // shape of the SFH inputs will not be inferrable even once the shapes of the - // RAH outputs are known, an error is returned by the rewriter. - // - // Once edges between compiled and outside_compilation clusters have been - // replaced by send/recv ops, some dependencies may no longer be apparent. - // A clustering pass finds all the dependencies between HC nodes that are only - // present as a result of edges between nodes in outside_compilation clusters. - // Suppose there is a path from outside_compilation cluster C in subgraph S - // to outside_compilation cluster D in subgraph T. If S != T then a control - // edge is added from the call node for S to the call node for T, which - // ensures that C will execute before D because S executes before T. If S==T - // then a control dependency is added between the HC nodes for C and D in S, - // and the HC node for C is added to an 'ancestors' attr in the HC node for D - // so that during compilation of the HC node for D, an XLA control dependency - // can be added to ensure C's SendToHost executes before D's RecvFromHost. 
+ // Arg --> B --> C --> D --> Retval class Subgraph { public: // Creates a graph to build the subgraph in, if it doesn't already exist, @@ -262,17 +207,6 @@ class Encapsulator { const std::unordered_map& node_images, Graph* graph_out); - // Adds _RecvAtHost and _SendFromHost nodes, where needed, to graph_out. - Status AddOutsideCompilationHostIONodes( - const string& group_attribute, const string& subgraph_name, - const string& outside_compilation_attribute, - const std::unordered_map& node_images, - Graph* graph_out); - - // Returns the names of all the outside_compilation subgraphs in this - // Subgraph. - void GetOutsideCompilationSubgraphNames(std::vector* names) const; - // Returns the Node that the inputs and outputs of the function should be // wired up to. Node* GetCallNode() const; @@ -283,24 +217,6 @@ class Encapsulator { // Returns the index of the result that the src of edge should connect to. int GetResultIndexForEdge(const Edge* edge) const; - // Returns the RecvAtHost node for an outside_compilation subgraph. - Node* GetRecvAtHostNode( - const string& outside_compilation_subgraph_name) const; - - // Returns the output slot for the RecvAtHost node that corresponds to the - // source of edge in an outside_compilation subgraph. - int GetRecvAtHostSlot(const string& outside_compilation_subgraph_name, - const Edge* edge) const; - - // Returns the SendFromHost node for an outside_compilation subgraph. - Node* GetSendFromHostNode( - const string& outside_compilation_subgraph_name) const; - - // Returns the input slot for the SendFromHost node that corresponds to the - // destination of edge in an outside_compilation subgraph. - int GetSendFromHostSlot(const string& outside_compilation_subgraph_name, - const Edge* edge) const; - // Creates an _Arg node for the src node of edge, and add its index to // args_by_src_, if none exists yet. Also adds its index to args_by_dst_, // and adds the edge within the subgraph from the _Arg node to the image of @@ -323,37 +239,6 @@ class Encapsulator { const Edge* edge, const std::unordered_map& node_images); - // Creates an outside_compilation subgraph for outside_compilation_id if - // none exists yet. Creates an entry for the src node of edge in the list of - // inputs for the outside_compilation subgraph, if none exists yet. - void RecordOutsideCompilationInputOrControl( - const string& outside_compilation_id, const Edge* edge); - - // Creates an outside_compilation subgraph for outside_compilation_id if - // none exists yet. Creates an entry for the src node of edge in the list of - // outputs by src for the outside_compilation subgraph, if none exists - // yet. Creates an entry for the dst node of edge in the list of outputs by - // dst for the outside_compilation subgraph. - void RecordOutsideCompilationOutputOrControl( - const string& outside_compilation_id, const Edge* edge); - - // Records the fact that there is a path from a node in outside_compilation - // cluster ancestor to node in cluster successor that does not go through - // the subgraph. - void RecordOutsideCompilationDependency(const string& successor, - const string& ancestor); - - // Returns the mapping from outside_compilation cluster C to the set of - // outside_compilation clusters that have a path to C entirely outside - // compiled subgraphs. - const std::unordered_map> - OutsideCompilationAncestorMap() const; - - // Adds the HostCompute nodes for each outside_compilation subgraph. 
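With the outside_compilation paths removed, the encapsulation pass reduces to grouping nodes by their 'group_attribute' value and building one function subgraph per group, as the A --> B --> C --> D --> E example in the comment above illustrates. A toy sketch of that grouping step, over a hypothetical (name, group) node list rather than the real Graph/Node API:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct ToyNode {
  std::string name;
  std::string group;  // Value of the group attribute; empty if unmarked.
};

int main() {
  // A --> B --> C --> D --> E with only B, C, D marked as group "0".
  std::vector<ToyNode> nodes = {{"A", ""}, {"B", "0"}, {"C", "0"},
                                {"D", "0"}, {"E", ""}};
  std::map<std::string, std::vector<std::string>> subgraphs;
  for (const ToyNode& n : nodes) {
    if (!n.group.empty()) subgraphs[n.group].push_back(n.name);
  }
  // Each group becomes a function (Arg --> B --> C --> D --> Retval) while the
  // unmarked nodes stay in the host graph around a single Call node.
  for (const auto& entry : subgraphs) {
    std::printf("subgraph %s:", entry.first.c_str());
    for (const std::string& name : entry.second) std::printf(" %s", name.c_str());
    std::printf("\n");
  }
  return 0;
}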
- Status AddHostComputes( - const string& subgraph_name, - const std::unordered_map& node_images); - // Creates the sequencer node if it doesn't exist, adding it to graph_out. Status MakeSequencingNode(const string& subgraph_name, Graph* graph_out); @@ -361,102 +246,9 @@ class Encapsulator { // the call node. void ConnectSequencerToCallNode(Graph* graph_out); - Status AddShapeInferenceInfo( - const string& subgraph_name, - const string& outside_compilation_subgraph_name, - const std::vector& shapes, Graph* inference_graph, - FunctionLibraryDefinition* library); - Status ReplaceFunctionDef(FunctionLibraryDefinition* library); private: - struct OutsideCompilationSubgraph { - // Map from source (producer node/slot) tensors in the original graph to - // input index (slot number in the HostCompute/RecvAtHost nodes that will - // be created) for the outside_compilation subgraph. - std::unordered_map inputs; - - // Set of nodes in the original graph that are the source of control edges - // that cross from the containing compiled subgraph into the - // outside_compilation subgraph. These are recorded by - // RecordOutsideCompilationInputOrControl while walking all the subgraph - // edges, and lifted control edges within the subgraph are added by - // AddSendsToOutsideCompilation once the _HostCompute node has been - // created. The matching control edge from _RecvAtHost to the - // destination is added by CopyEdgeToOutputGraph. - std::unordered_set control_inputs; - - // Maps from source (producer node/slot) and destination (consumer - // node/slot) tensors in the original graph to output index (slot number - // in the SendFromHost/HostCompute nodes that will be created) for the - // outside_compilation subgraph. - struct ArgNumAndType { - int index; - DataType dtype; - - ArgNumAndType(int i, DataType t) : index(i), dtype(t) {} - }; - std::unordered_map - outputs_by_src; - std::unordered_map outputs_by_dst; - - // Set of nodes in the original graph that are the destination of control - // edges that cross from the outside_compilation subgraph into the - // containing compiled subgraph. These are recorded by - // RecordOutsideCompilationOutputOrControl while walking all the subgraph - // edges, and lifted control edges within the subgraph are added by - // AddRecvsFromToOutsideCompilation once the _HostCompute node has been - // created. The matching control edge from the source to _SendFromHost to - // the destination is added by CopyEdgeToOutputGraph. - std::unordered_set control_outputs; - - // Name of the _HostCompute node in the subgraph. - string host_compute_name; - - // _RecvAtHost node in the output graph. Not owned. - Node* recv_at_host = nullptr; - - // _SendFromHost node in the output graph. Not owned. - Node* send_from_host = nullptr; - }; - - // Creates an outside_compilation subgraph for outside_compilation_id if - // none exists yet. Returns the (possible newly created) subgraph for - // outside_compilation_id. - OutsideCompilationSubgraph* LookupOrCreateOutsideCompilationSubgraph( - const string& outside_compilation_id); - - // Builds a placeholder node used to provide the key input to a RecvAtHost - // or SendFromHost node. This placeholder node will be removed by a later - // pass. - Status AddHostComputeKeyPlaceholder(OutsideCompilationSubgraph* oc_subgraph, - Graph* graph_out); - - // Get the set of outside_compilation clusters and the dependency edges - // between them. 
- void GetActiveClusterDependencyGraph( - std::unordered_set* clusters, - std::unordered_set* has_successor, - std::unordered_map>* ancestors_map); - - // Builds a _RecvAtHost node producing all the inputs of an - // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host. - Status AddRecvAtHostNode(const string& group_attribute, - const string& subgraph_name, - const string& outside_compilation_attribute, - const string& oc_subgraph_name, - OutsideCompilationSubgraph* oc_subgraph, - Graph* graph_out); - - // Builds a _SendFromHost node consuming all the outputs of an - // outside_compilation subgraph and stores it in oc_subgraph.send_from_host. - Status AddSendFromHostNode( - const std::unordered_map& node_images, - const string& group_attribute, const string& subgraph_name, - const string& outside_compilation_attribute, - const string& oc_subgraph_name, OutsideCompilationSubgraph* oc_subgraph, - Graph* graph_out); - // The subgraph extracted from the input graph, suitable for being turned // into a FunctionDef. Inputs are fed by _Arg nodes, and outputs are // returned by _Retval nodes. @@ -498,31 +290,13 @@ class Encapsulator { // removed from the graph. absl::flat_hash_set control_output_nodes_; - // The outside_compilation clusters in this subgraph. - std::unordered_map - outside_compilation_subgraphs_; - // For each outside_compilation cluster C, the outside_compilation clusters - // that have a path to C outside the compiled graph. - std::unordered_map> - outside_compilation_ancestors_; - // For each outside_compilation cluster C, the outside_compilation clusters - // that have a path from C outside the compiled graph. - std::unordered_map> - outside_compilation_successors_; - - // NoOp node in the output graph that is sequenced after the call node and - // used to prevent host-side outside_compilation sends and recvs from being - // pruned. + // NoOp node in the output graph that is sequenced after the call node. Node* sequencer_ = nullptr; }; - // Returns the key attribute and outside_compilation attribute associated - // with a node in attr, and outside_compilation_attr, respectively. Sets - // either result to the empty string if the respective attribute is not - // found. Returns error status if there is an outside_compilation attribute - // and no key attribute, - Status GetFunctionNameAttr(Node const* node, string* attr, - string* outside_compilation_attr) const; + // Returns the key attribute associated with a node in attr. Sets either + // result to the empty string if the respective attribute is not found. + Status GetFunctionNameAttr(Node const* node, string* attr) const; // Copies edges local to a subgraph. Adds _Arg and _Retval nodes to // subgraphs for data edges that cross subgraph boundaries. @@ -530,8 +304,7 @@ class Encapsulator { const std::unordered_map& node_images, std::vector>* src_arg_pairs); - // Copies all marked nodes to a subgraph. Does nothing for unmarked nodes, - // or nodes marked outside_compilation. + // Copies all marked nodes to a subgraph. Does nothing for unmarked nodes. Status CopySubgraphNodes(std::unordered_map* node_images); // Copies all nodes that aren't in a compiled subgraph to the output graph. @@ -543,92 +316,50 @@ class Encapsulator { const std::unordered_map& node_images, Graph* graph_out); - // Adds _RecvAtHost and _SendFromHost nodes, where needed, for all - // outside_compilation subgraphs. 
- Status AddOutsideCompilationHostIONodes( - const std::unordered_map& node_images, - Graph* graph_out); - // Finds the image of an edge source in the output graph. If the edge crosses // a subgraph boundary it is the output of a call node, otherwise it is a node // in the output graph. Status FindOutputImageOfEdgeSrc( - const string& src_func_id, const string& src_outside_compilation_id, - const string& dst_func_id, const string& dst_outside_compilation_id, + const string& src_func_id, const string& dst_func_id, const std::unordered_map& node_images, const Node* original_src_node, Node** src_image); // Finds an edge source slot in the output graph. If the edge crosses a - // subgraph boundary it is a slot on the output of a call node or a - // _RecvAtHost node, otherwise it is a slot on a node in the output graph. + // subgraph boundary it is a slot on the output of a call node, otherwise it + // is a slot on a node in the output graph. int FindOutputSlotOfEdgeSrc(const string& src_func_id, - const string& src_outside_compilation_id, const string& dst_func_id, - const string& dst_outside_compilation_id, const Edge* edge); // Finds the image of an edge destination in the output graph. If the edge - // crosses a subgraph boundary it is the input of a call node or a - // _SendFromHost node, otherwise it is a node in the output graph. + // crosses a subgraph boundary it is the input of a call node, otherwise it is + // a node in the output graph. Status FindOutputImageOfEdgeDst( - const string& src_func_id, const string& src_outside_compilation_id, - const string& dst_func_id, const string& dst_outside_compilation_id, + const string& src_func_id, const string& dst_func_id, const std::unordered_map& node_images, const Node* original_dst_node, Node** dst_image); // Finds an edge destination slot in the output graph. If the edge crosses a - // subgraph boundary it is a slot on the input of a call node or a - // _SendFromHost node, otherwise it is a slot on a node in the output graph. + // subgraph boundary it is a slot on the input of a call node, otherwise it is + // a slot on a node in the output graph. int FindOutputSlotOfEdgeDst(const string& src_func_id, - const string& src_outside_compilation_id, const string& dst_func_id, - const string& dst_outside_compilation_id, const Edge* edge); // Copies a single edge to the output graph. The edge is either entirely // within the output graph, or crosses into or out of a compiled subgraph. Status CopyEdgeToOutputGraph( - const Edge* edge, const string& src_func_id, - const string& src_outside_compilation_id, const string& dst_func_id, - const string& dst_outside_compilation_id, + const Edge* edge, const string& src_func_id, const string& dst_func_id, const std::unordered_map& node_images, Graph* graph_out, std::unordered_set, OutputInputTensorPairHasher>* edges_added); - // Adds control dependencies between subgraph call nodes that have - // dependencies via outside_compilation edges. - Status AddCallNodeDependencies(Graph* graph_out); - // Adds all edges to the output graph. Status AddEdgesToOutputGraph( const std::unordered_map& node_images, Graph* graph_out); - // Constructs a minimal shape inference graph that can be used to determine - // the shape of send_node at the time that the subgraph is compiled. - // recv_at_host_nodes contains the names of all the recv_at_host nodes that - // send_node might depend on. These recv_at_host nodes have shapes that are - // not known during the rewrite pass, but will be known at compile time. 
- // - // If the shapes of all the inputs to send_node can be determined during the - // rewrite pass, on exit graphdef_out is empty and the shapes are returned in - // static_shape_out. Otherwise graphdef_out contains a graph that can be used - // for shape inference at compile time, where all the source nodes of the - // graph are either constants with known shapes, or nodes named in - // recv_at_host_nodes. - // - // A non-OK status is returned if neither of the above conditions can be - // satisfied, e.g., because send_node depends on a node that doesn't have a - // registered shape inference function. - Status DoStaticShapeInferenceForOutsideCompilationSend( - const Graph& graph_in, const BackEdgeHelper& back_edge_helper, - const ShapeRefiner& shape_refiner, - const std::unordered_set& recv_at_host_nodes, Node* send_node, - FunctionLibraryDefinition* library, - std::vector* static_shape_out, - std::unique_ptr* graph_out); - // Makes a copy of graph containing only nodes that are ancestors of at least // one node in send_from_host_nodes and store it in pruned_graph. On exit // nodes_images contains a mapping from nodes in graph to nodes in @@ -639,35 +370,10 @@ class Encapsulator { std::unordered_map* node_images, FunctionLibraryDefinition* library); - // Makes a copy of graph containing only nodes that are ancestors of a - // send_from_host node in an outside_compilation subgraph, and store it in - // pruned_graph. Also perform shape inference on the pruned graph, using - // shape_refiner. On exit node_images contains a mapping from nodes in graph - // to nodes in pruned_graph. - Status MakeGraphForOutsideCompilationSends( - const Graph& graph, std::unique_ptr* pruned_graph, - BackEdgeHelper* back_edge_helper, ShapeRefiner* shape_refiner, - std::unordered_map* node_images, - FunctionLibraryDefinition* library); - - // Performs static shape inference, as far as possible, for the send_from_host - // nodes in each outside_compilation subgraph. Where it is not possible to - // determine the shape statically, stores a serialized GraphDef in the - // HostCompute 'shape_inference_graph' attr, to be used at compile time for - // final inference. If the shapes are known statically they are stored in the - // HostCompute 'shapes' attr. - Status GetShapeInfoForOutsideCompilationSends( - Graph* graph_out, FunctionLibraryDefinition* library); - const string group_attribute_; - const string outside_compilation_attribute_; const Graph* graph_in_; std::unordered_map subgraphs_; - // For each subgraph S the subgraphs S' such that there is a path in some - // outside_compilation cluster C in S to some outside_compilation cluster C' - // in S', that goes only through the uncompiled graph. 
- std::unordered_map> subgraph_ancestors_; TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator); }; @@ -733,30 +439,6 @@ int Encapsulator::Subgraph::GetResultIndexForEdge(const Edge* edge) const { return results_.at(OutputTensor(edge->src(), edge->src_output())); } -Node* Encapsulator::Subgraph::GetRecvAtHostNode( - const string& outside_compilation_subgraph_name) const { - return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name) - .recv_at_host; -} - -int Encapsulator::Subgraph::GetRecvAtHostSlot( - const string& outside_compilation_subgraph_name, const Edge* edge) const { - return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name) - .inputs.at(OutputTensor(edge->src(), edge->src_output())); -} - -Node* Encapsulator::Subgraph::GetSendFromHostNode( - const string& outside_compilation_subgraph_name) const { - return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name) - .send_from_host; -} - -int Encapsulator::Subgraph::GetSendFromHostSlot( - const string& outside_compilation_subgraph_name, const Edge* edge) const { - return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name) - .outputs_by_dst.at(InputTensor(edge->dst(), edge->dst_input())); -} - Node* Encapsulator::Subgraph::MakeNodeImage(const Graph* graph_in, Node* node) { if (!graph_) { graph_.reset(new Graph(graph_in->op_registry())); @@ -854,217 +536,6 @@ Status Encapsulator::Subgraph::RecordResult( return Status::OK(); } -Encapsulator::Subgraph::OutsideCompilationSubgraph* -Encapsulator::Subgraph::LookupOrCreateOutsideCompilationSubgraph( - const string& outside_compilation_id) { - auto iter = outside_compilation_subgraphs_ - .emplace(outside_compilation_id, OutsideCompilationSubgraph()) - .first; - OutsideCompilationSubgraph* outside_subgraph = &iter->second; - return outside_subgraph; -} - -void Encapsulator::Subgraph::RecordOutsideCompilationInputOrControl( - const string& outside_compilation_id, const Edge* edge) { - OutsideCompilationSubgraph* outside_subgraph = - LookupOrCreateOutsideCompilationSubgraph(outside_compilation_id); - if (edge->IsControlEdge()) { - outside_subgraph->control_inputs.insert(edge->src()); - } else { - int input_index = outside_subgraph->inputs.size(); - outside_subgraph->inputs.emplace( - OutputTensor(edge->src(), edge->src_output()), input_index); - } -} - -void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl( - const string& outside_compilation_id, const Edge* edge) { - OutsideCompilationSubgraph* outside_subgraph = - LookupOrCreateOutsideCompilationSubgraph(outside_compilation_id); - if (edge->IsControlEdge()) { - outside_subgraph->control_outputs.insert(edge->dst()); - } else { - DataType dtype = edge->dst()->input_type(edge->dst_input()); - auto output_iter = - outside_subgraph->outputs_by_src - .emplace(OutputTensor(edge->src(), edge->src_output()), - OutsideCompilationSubgraph::ArgNumAndType( - outside_subgraph->outputs_by_src.size(), dtype)) - .first; - const int output_index = output_iter->second.index; - outside_subgraph - ->outputs_by_dst[InputTensor(edge->dst(), edge->dst_input())] = - output_index; - } -} - -void Encapsulator::Subgraph::RecordOutsideCompilationDependency( - const string& successor, const string& ancestor) { - outside_compilation_ancestors_[successor].insert(ancestor); - outside_compilation_successors_[ancestor].insert(successor); -} - -const std::unordered_map> -Encapsulator::Subgraph::OutsideCompilationAncestorMap() const { - return outside_compilation_ancestors_; -} - -void 
Encapsulator::Subgraph::GetActiveClusterDependencyGraph( - std::unordered_set* clusters, - std::unordered_set* has_successor, - std::unordered_map>* ancestors_map) { - // During initial clustering the ancestor and successor datastructures may - // have been built including oc_cluster names that never turned into subgraphs - // because they had no edges into or out of the compiled cluster. Remove them - // before proceeding to simplify the logic. Get the set of clusters that was - // actually added, then remove references to the others. - for (const auto& oc_subgraph : outside_compilation_subgraphs_) { - clusters->insert(oc_subgraph.first); - } - for (const auto& cluster : outside_compilation_successors_) { - if (clusters->find(cluster.first) != clusters->end()) { - for (const auto& successor : cluster.second) { - if (clusters->find(successor) != clusters->end()) { - has_successor->insert(cluster.first); - break; - } - } - } - } - for (const auto& cluster : outside_compilation_ancestors_) { - if (clusters->find(cluster.first) != clusters->end()) { - std::unordered_set& ancestors = (*ancestors_map)[cluster.first]; - for (const auto& ancestor : cluster.second) { - if (clusters->find(ancestor) != clusters->end()) { - ancestors.insert(ancestor); - } - } - } - } -} - -Status Encapsulator::Subgraph::AddHostComputes( - const string& subgraph_name, - const std::unordered_map& node_images) { - // Get the set of outside_compilation clusters and the dependency edges - // between them. - std::unordered_set clusters; - std::unordered_set has_successor; - std::unordered_map> ancestors_map; - GetActiveClusterDependencyGraph(&clusters, &has_successor, &ancestors_map); - // Topologically sort the outside_compilation clusters according to their - // dependency relation. - std::vector sorted_clusters; - TopologicalClusterSort(clusters, has_successor, ancestors_map, - &sorted_clusters); - - // The host compute nodes added for each outside_compilation_cluster; - std::unordered_map host_compute_node; - for (const string& oc_subgraph_name : sorted_clusters) { - OutsideCompilationSubgraph& oc_subgraph = - outside_compilation_subgraphs_[oc_subgraph_name]; - if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() || - !oc_subgraph.outputs_by_src.empty() || - !oc_subgraph.control_outputs.empty()) { - // Build a _HostCompute node. - std::vector inputs(oc_subgraph.inputs.size()); - std::vector input_dtypes(oc_subgraph.inputs.size(), DT_INVALID); - std::vector output_dtypes(oc_subgraph.outputs_by_src.size(), - DT_INVALID); - - for (const auto& input_src : oc_subgraph.inputs) { - const Node* src_node = input_src.first.node; - Node* src_image = node_images.at(src_node); - int src_slot = input_src.first.index; - int input_index = input_src.second; - - DataType dtype = src_node->output_type(src_slot); - inputs[input_index].Reset(src_image->name(), src_slot, dtype); - input_dtypes[input_index] = dtype; - } - for (const auto& output : oc_subgraph.outputs_by_src) { - DataType dtype = output.second.dtype; - int output_index = output.second.index; - output_dtypes[output_index] = dtype; - } - - std::vector host_compute_ancestors; - const auto iter = ancestors_map.find(oc_subgraph_name); - if (iter != ancestors_map.end()) { - for (const string& ancestor_cluster : iter->second) { - host_compute_ancestors.push_back( - outside_compilation_subgraphs_[ancestor_cluster] - .host_compute_name); - } - } - - NodeDef host_compute_def; - // TODO(shikharagarwal): What source node should we use for errors? 
- NodeDefBuilder builder(absl::StrCat("outside_compilation_", - oc_subgraph_name, "_host_compute"), - kHostComputeOp); - builder.Input(inputs); - builder.Attr("Tinputs", input_dtypes); - builder.Attr("Toutputs", output_dtypes); - builder.Attr("ancestors", host_compute_ancestors); - builder.Attr("key", absl::StrCat("host_compute_channel_", subgraph_name, - "_", oc_subgraph_name)); - builder.Attr("_outside_compilation_subgraph", oc_subgraph_name); - Status s = builder.Finalize(&host_compute_def); - if (!s.ok()) return s; - - Node* host_compute = graph_->AddNode(host_compute_def, &s); - if (!s.ok()) return s; - host_compute_node[host_compute->name()] = host_compute; - oc_subgraph.host_compute_name = host_compute->name(); - - // Connect the _HostCompute node to its producers in the subgraph. - for (auto& input_src : oc_subgraph.inputs) { - const Node* src_node = input_src.first.node; - Node* src_image = node_images.at(src_node); - int src_slot = input_src.first.index; - int input_index = input_src.second; - graph_->AddEdge(src_image, src_slot, host_compute, input_index); - } - - // Connect the _HostCompute node to its control edge producers in the - // subgraph. - for (const auto& src_node : oc_subgraph.control_inputs) { - Node* src_image = node_images.at(src_node); - graph_->AddControlEdge(src_image, host_compute, - /* allow_duplicates= */ true); - } - - // Connect the _HostCompute node to its ancestor host compute nodes. - for (const auto& ancestor_name : host_compute_ancestors) { - Node* ancestor = host_compute_node[ancestor_name]; - graph_->AddControlEdge(ancestor, host_compute, - /* allow_duplicates= */ true); - } - - // Connect the consumers in the subgraph to the _HostCompute node. - for (const auto& output : oc_subgraph.outputs_by_dst) { - const Node* dst_node = output.first.node; - Node* dst_image = node_images.at(dst_node); - int dst_slot = output.first.index; - int output_index = output.second; - - graph_->AddEdge(host_compute, output_index, dst_image, dst_slot); - } - - // Connect the control edge consumers in the subgraph to the _HostCompute - // node. 
- for (const auto& dst_node : oc_subgraph.control_outputs) { - Node* dst_image = node_images.at(dst_node); - graph_->AddControlEdge(host_compute, dst_image, - /* allow_duplicates= */ true); - } - } - } - - return Status::OK(); -} - Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name, Graph* graph_out) { if (sequencer_ == nullptr) { @@ -1167,48 +638,6 @@ Status Encapsulator::Subgraph::BuildFunctionDef( return Status::OK(); } -Status Encapsulator::Subgraph::AddShapeInferenceInfo( - const string& subgraph_name, - const string& outside_compilation_subgraph_name, - const std::vector& shapes, Graph* inference_graph, - FunctionLibraryDefinition* library) { - OutsideCompilationSubgraph& oc_subgraph = - outside_compilation_subgraphs_.at(outside_compilation_subgraph_name); - - Node* host_compute = nullptr; - for (Node* n : graph_->nodes()) { - if (n->name() == oc_subgraph.host_compute_name) { - host_compute = n; - break; - } - } - if (host_compute == nullptr) { - return errors::InvalidArgument( - "After rewriting subgraph ", outside_compilation_subgraph_name, - " there is no HostCompute Op for outside compilation subgraph ", - oc_subgraph.host_compute_name); - } - - if (inference_graph == nullptr) { - host_compute->AddAttr("shape_inference_graph", ""); - host_compute->AddAttr("shapes", shapes); - } else { - string inference_graph_name = - absl::StrCat("_outside_compilation_shape_inference_", subgraph_name, - "_", outside_compilation_subgraph_name); - FunctionDef fdef; - TF_RETURN_IF_ERROR( - GraphToFunctionDef(*inference_graph, inference_graph_name, &fdef)); - host_compute->AddAttr("shape_inference_graph", inference_graph_name); - host_compute->AddAttr("shapes", std::vector()); - // TODO(sibyl-Aix6ihai): Understand why there are multiple calls to Encapsulator. 
- if (library->Find(inference_graph_name) == nullptr) { - TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef)); - } - } - return Status::OK(); -} - Status Encapsulator::Subgraph::ReplaceFunctionDef( FunctionLibraryDefinition* library) { const string& name = function_def_name_; @@ -1241,214 +670,29 @@ Status Encapsulator::Subgraph::AddFunctionCallNode( return Status::OK(); } -Status Encapsulator::Subgraph::AddHostComputeKeyPlaceholder( - OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) { - TensorShapeProto shape_proto; - TensorShape shape({2}); - shape.AsProto(&shape_proto); - GraphDefBuilder::Options options(graph_out, /*status=*/nullptr); - NodeDef key_def; - NodeDefBuilder builder( - absl::StrCat(call_node_def_.name(), "_key_placeholder"), "Placeholder", - NodeDebugInfo(call_node_def_)); - builder.Attr("dtype", DT_STRING); - builder.Attr("shape", shape_proto); - builder.Attr("_host_compute_call_node", call_node_def_.name()); - Status s = builder.Finalize(&key_def); - if (!s.ok()) return s; - - host_compute_key_placeholder_ = graph_out->AddNode(key_def, &s); - if (!s.ok()) return s; - host_compute_key_placeholder_->set_assigned_device_name(device_); - - return Status::OK(); -} - -Status Encapsulator::Subgraph::AddRecvAtHostNode( - const string& group_attribute, const string& subgraph_name, - const string& outside_compilation_attribute, const string& oc_subgraph_name, - OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) { - if (host_compute_key_placeholder_ == nullptr) { - TF_RETURN_IF_ERROR(AddHostComputeKeyPlaceholder(oc_subgraph, graph_out)); - } - - std::vector dtypes(oc_subgraph->inputs.size(), DT_INVALID); - - for (const auto& input : oc_subgraph->inputs) { - const Node* src_node = input.first.node; - int src_slot = input.first.index; - int input_index = input.second; - - DataType dtype = src_node->output_type(src_slot); - dtypes[input_index] = dtype; - } - - NodeDef recv_def; - // TODO(shikharagarwal): What source node should we use for errors? - NodeDefBuilder builder(absl::StrCat("outside_compilation_", subgraph_name, - "_", oc_subgraph_name, "_recv"), - kRecvAtHostOp); - builder.Device(device_); - builder.Attr("Toutputs", dtypes); - // The correct device_ordinal will be inserted during replication in a - // subsequent rewrite. - builder.Attr("device_ordinal", 0); - builder.Attr("key", absl::StrCat("host_compute_channel_", subgraph_name, "_", - oc_subgraph_name)); - builder.Attr(group_attribute, subgraph_name); - builder.Attr(outside_compilation_attribute, oc_subgraph_name); - builder.Input(host_compute_key_placeholder_->name(), 0, DT_STRING); - Status s = builder.Finalize(&recv_def); - if (!s.ok()) return s; - - oc_subgraph->recv_at_host = graph_out->AddNode(recv_def, &s); - if (!s.ok()) return s; - graph_out->AddEdge(host_compute_key_placeholder_, 0, - oc_subgraph->recv_at_host, 0); - - // Add a control dependency forcing the RecvAtHost to run before the subgraph - // completes. This has no effect on execution order but prevents the - // RecvAtHost being pruned. 
- TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out)); - graph_out->AddControlEdge(oc_subgraph->recv_at_host, sequencer_, - true /* skip duplicates check */); - - return Status::OK(); -} - -Status Encapsulator::Subgraph::AddSendFromHostNode( - const std::unordered_map& node_images, - const string& group_attribute, const string& subgraph_name, - const string& outside_compilation_attribute, const string& oc_subgraph_name, - OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) { - if (host_compute_key_placeholder_ == nullptr) { - TF_RETURN_IF_ERROR(AddHostComputeKeyPlaceholder(oc_subgraph, graph_out)); - } - - std::vector dtypes(oc_subgraph->outputs_by_src.size(), DT_INVALID); - std::vector inputs( - oc_subgraph->outputs_by_src.size()); - - for (const auto& output : oc_subgraph->outputs_by_src) { - const Node* src_node = output.first.node; - Node* src_image = node_images.at(src_node); - int src_slot = output.first.index; - int output_index = output.second.index; - - DataType dtype = src_node->output_type(src_slot); - dtypes[output_index] = dtype; - inputs[output_index].Reset(src_image->name(), src_slot, dtype); - } - - NodeDef send_def; - // TODO(shikharagarwal): What source node should we use for errors? - NodeDefBuilder builder(absl::StrCat("outside_compilation_", subgraph_name, - "_", oc_subgraph_name, "_send"), - kSendFromHostOp); - builder.Device(device_); - builder.Attr("Tinputs", dtypes); - builder.Attr("key", absl::StrCat("host_compute_channel_", subgraph_name, "_", - oc_subgraph_name)); - // The correct device_ordinal will be inserted during replication in a - // subsequent rewrite. - builder.Attr("device_ordinal", 0); - builder.Attr(group_attribute, subgraph_name); - builder.Attr(outside_compilation_attribute, oc_subgraph_name); - builder.Input(inputs); - builder.Input(host_compute_key_placeholder_->name(), 0, DT_STRING); - Status s = builder.Finalize(&send_def); - if (!s.ok()) return s; - - oc_subgraph->send_from_host = graph_out->AddNode(send_def, &s); - if (!s.ok()) return s; - graph_out->AddEdge(host_compute_key_placeholder_, 0, - oc_subgraph->send_from_host, inputs.size()); - - // Add a control dependency forcing the SendFromHost to run before the - // subgraph completes. This has no effect on execution order but prevents the - // RecvAtHost being pruned. 
- TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out)); - graph_out->AddControlEdge(oc_subgraph->send_from_host, sequencer_, - /* allow_duplicates= */ true); - - return Status::OK(); -} - -Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes( - const string& group_attribute, const string& subgraph_name, - const string& outside_compilation_attribute, - const std::unordered_map& node_images, - Graph* graph_out) { - for (auto& outside_compilation_subgraph_entry : - outside_compilation_subgraphs_) { - const string& oc_name = outside_compilation_subgraph_entry.first; - OutsideCompilationSubgraph& oc_subgraph = - outside_compilation_subgraph_entry.second; - - if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) { - TF_RETURN_IF_ERROR(AddRecvAtHostNode(group_attribute, subgraph_name, - outside_compilation_attribute, - oc_name, &oc_subgraph, graph_out)); - } - - if (!oc_subgraph.outputs_by_src.empty() || - !oc_subgraph.control_outputs.empty()) { - TF_RETURN_IF_ERROR(AddSendFromHostNode( - node_images, group_attribute, subgraph_name, - outside_compilation_attribute, oc_name, &oc_subgraph, graph_out)); - } - } - return Status::OK(); -} - -void Encapsulator::Subgraph::GetOutsideCompilationSubgraphNames( - std::vector* names) const { - for (auto& entry : outside_compilation_subgraphs_) { - names->push_back(entry.first); - } -} - -Status Encapsulator::GetFunctionNameAttr( - Node const* node, string* attr, string* outside_compilation_attr) const { +Status Encapsulator::GetFunctionNameAttr(Node const* node, string* attr) const { AttrSlice attrs = node->attrs(); attr->clear(); - outside_compilation_attr->clear(); bool found_group_attribute = false; - bool found_outside_compilation_attribute = false; for (const auto& node_attr : attrs) { if (node_attr.first == group_attribute_) { TF_RETURN_IF_ERROR(AttrValueHasType(node_attr.second, "string")); *attr = node_attr.second.s(); found_group_attribute = true; - } else if (node_attr.first == outside_compilation_attribute_) { - TF_RETURN_IF_ERROR(AttrValueHasType(node_attr.second, "string")); - *outside_compilation_attr = node_attr.second.s(); - found_outside_compilation_attribute = true; + break; } - if (found_group_attribute && found_outside_compilation_attribute) break; - } - - if (found_outside_compilation_attribute && !found_group_attribute) { - return errors::InvalidArgument( - "Node ", node->name(), " has ", outside_compilation_attribute_, - " attribute but no ", group_attribute_, " attribute."); - } else { - return Status::OK(); } + return Status::OK(); } -bool IsInSubgraph(const string& func_id, const string& outside_compilation_id) { - return !func_id.empty() && outside_compilation_id.empty(); -} +bool IsInSubgraph(const string& func_id) { return !func_id.empty(); } Status Encapsulator::CopySubgraphNodes( std::unordered_map* node_images) { for (Node* node : graph_in_->op_nodes()) { string func_id; - string outside_compilation_id; - TF_RETURN_IF_ERROR( - GetFunctionNameAttr(node, &func_id, &outside_compilation_id)); - if (!IsInSubgraph(func_id, outside_compilation_id)) continue; + TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id)); + if (!IsInSubgraph(func_id)) continue; Subgraph& subgraph = subgraphs_[func_id]; Node* image = subgraph.MakeNodeImage(graph_in_, node); @@ -1463,19 +707,14 @@ Status Encapsulator::CopySubgraphEdges( std::vector>* src_arg_pairs) { for (const Edge* edge : graph_in_->edges()) { string src_func_id; - string src_outside_compilation_id; - 
TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id, - &src_outside_compilation_id)); + TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id)); string dst_func_id; - string dst_outside_compilation_id; - TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id, - &dst_outside_compilation_id)); + TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id)); Node* src_image = gtl::FindWithDefault(node_images, edge->src(), nullptr); Node* dst_image = gtl::FindWithDefault(node_images, edge->dst(), nullptr); // Copy edges that are local to a subgraph. - if (IsInSubgraph(src_func_id, src_outside_compilation_id) && - IsInSubgraph(dst_func_id, dst_outside_compilation_id) && + if (IsInSubgraph(src_func_id) && IsInSubgraph(dst_func_id) && src_func_id == dst_func_id) { Graph* g = subgraphs_[src_func_id].GetGraph(); if (edge->IsControlEdge()) { @@ -1488,7 +727,7 @@ Status Encapsulator::CopySubgraphEdges( } // Record 'src' as an output of its subgraph, if applicable. - if (IsInSubgraph(src_func_id, src_outside_compilation_id)) { + if (IsInSubgraph(src_func_id)) { if (!edge->IsControlEdge()) { DataType dtype = edge->src()->output_type(edge->src_output()); if (IsRefType(dtype)) { @@ -1500,23 +739,15 @@ Status Encapsulator::CopySubgraphEdges( } Subgraph& src_subgraph = subgraphs_[src_func_id]; - if (src_func_id == dst_func_id) { - // src is in the subgraph and dst is outside_compilation in the same - // subgraph. - src_subgraph.RecordOutsideCompilationInputOrControl( - dst_outside_compilation_id, edge); + if (edge->IsControlEdge()) { + TF_RETURN_IF_ERROR(src_subgraph.RecordControlResult(edge, node_images)); } else { - if (edge->IsControlEdge()) { - TF_RETURN_IF_ERROR( - src_subgraph.RecordControlResult(edge, node_images)); - } else { - TF_RETURN_IF_ERROR(src_subgraph.RecordResult(edge, node_images)); - } + TF_RETURN_IF_ERROR(src_subgraph.RecordResult(edge, node_images)); } } // Record 'dst' as an input of its subgraph, if applicable. - if (IsInSubgraph(dst_func_id, dst_outside_compilation_id)) { + if (IsInSubgraph(dst_func_id)) { // Look at the type of the destination not the source, since Ref output // Tensors can be automatically cast to non-Ref Tensors at the // destination. @@ -1531,18 +762,11 @@ Status Encapsulator::CopySubgraphEdges( } Subgraph& dst_subgraph = subgraphs_[dst_func_id]; - if (src_func_id == dst_func_id) { - // dst is in the subgraph and src is outside_compilation in the same - // subgraph. - dst_subgraph.RecordOutsideCompilationOutputOrControl( - src_outside_compilation_id, edge); - } else { - // Ignore control edges entering the subgraph. We will lift them onto - // the enclosing call operators in BuildOutputGraph(). - if (!edge->IsControlEdge()) { - TF_RETURN_IF_ERROR( - dst_subgraph.RecordArg(edge, node_images, src_arg_pairs)); - } + // Ignore control edges entering the subgraph. We will lift them onto + // the enclosing call operators in BuildOutputGraph(). + if (!edge->IsControlEdge()) { + TF_RETURN_IF_ERROR( + dst_subgraph.RecordArg(edge, node_images, src_arg_pairs)); } } } @@ -1564,16 +788,6 @@ Status Encapsulator::SplitIntoSubgraphs(FunctionLibraryDefinition* library) { TF_RETURN_IF_ERROR(CopySubgraphNodes(&node_images)); TF_RETURN_IF_ERROR(CopySubgraphEdges(node_images, &src_arg_pairs)); - // For each subgraph, add the nodes that deal with inputs and outputs its - // nested outside_compilation subgraphs. 
These could not be added earlier - // during CopySubgraphEdges since we need to discover all the types of the - // inputs and outputs for an outside_compilation subgraph before creating a - // single input and output node for it. - for (auto& entry : subgraphs_) { - Subgraph& subgraph = entry.second; - TF_RETURN_IF_ERROR(subgraph.AddHostComputes(entry.first, node_images)); - } - MarkGuaranteedConstants(*graph_in_, src_arg_pairs); for (auto& entry : subgraphs_) { @@ -1609,12 +823,10 @@ Status Encapsulator::CopyNodesToOutputGraph( Graph* graph_out, std::unordered_map* node_images) { for (Node* node : graph_in_->op_nodes()) { string func_id; - string outside_compilation_id; - TF_RETURN_IF_ERROR( - GetFunctionNameAttr(node, &func_id, &outside_compilation_id)); + TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id)); // Don't copy nodes that are going to be encapsulated. - if (IsInSubgraph(func_id, outside_compilation_id)) continue; + if (IsInSubgraph(func_id)) continue; Node* image = graph_out->CopyNode(node); (*node_images)[node] = image; @@ -1634,37 +846,14 @@ Status Encapsulator::AddFunctionCallNodes( return Status::OK(); } -Status Encapsulator::AddOutsideCompilationHostIONodes( - const std::unordered_map& node_images, - Graph* graph_out) { - for (auto& subgraph_entry : subgraphs_) { - const string& subgraph_name = subgraph_entry.first; - Subgraph& subgraph = subgraph_entry.second; - TF_RETURN_IF_ERROR(subgraph.AddOutsideCompilationHostIONodes( - group_attribute_, subgraph_name, outside_compilation_attribute_, - node_images, graph_out)); - } - return Status::OK(); -} - Status Encapsulator::FindOutputImageOfEdgeSrc( - const string& src_func_id, const string& src_outside_compilation_id, - const string& dst_func_id, const string& dst_outside_compilation_id, + const string& src_func_id, const string& dst_func_id, const std::unordered_map& node_images, const Node* original_src_node, Node** src_image) { - if (IsInSubgraph(src_func_id, src_outside_compilation_id)) { - if (dst_func_id == src_func_id) { - // The edge is from a subgraph to an outside_compilation cluster in the - // same subgraph so use the appropriate _RecvAtHost node in the output - // graph. - TF_RET_CHECK(!dst_outside_compilation_id.empty()); - *src_image = subgraphs_.at(src_func_id) - .GetRecvAtHostNode(dst_outside_compilation_id); - } else { - // The edge is from a subgraph to a regular node in the output graph so - // use the subgraph's call node output. - *src_image = subgraphs_.at(src_func_id).GetCallNode(); - } + if (IsInSubgraph(src_func_id)) { + // The edge is from a subgraph to a regular node in the output graph so + // use the subgraph's call node output. + *src_image = subgraphs_.at(src_func_id).GetCallNode(); } else { // The source of the edge is in the output graph so use the node image in // the output graph. @@ -1673,21 +862,14 @@ Status Encapsulator::FindOutputImageOfEdgeSrc( return Status::OK(); } -int Encapsulator::FindOutputSlotOfEdgeSrc( - const string& src_func_id, const string& src_outside_compilation_id, - const string& dst_func_id, const string& dst_outside_compilation_id, - const Edge* edge) { - if (IsInSubgraph(src_func_id, src_outside_compilation_id)) { +int Encapsulator::FindOutputSlotOfEdgeSrc(const string& src_func_id, + const string& dst_func_id, + const Edge* edge) { + if (IsInSubgraph(src_func_id)) { const Subgraph& src_subgraph = subgraphs_.at(src_func_id); - if (src_func_id == dst_func_id) { - // 'src' is in a subgraph and 'dst' is outside_compilation in the same - // subgraph. 
Use the corresponding _RecvAtHost output instead. - return src_subgraph.GetRecvAtHostSlot(dst_outside_compilation_id, edge); - } else { - // 'src' is in a subgraph and 'dst' is a regular node in the output - // graph. Use the corresponding call output instead. - return src_subgraph.GetResultIndexForEdge(edge); - } + // 'src' is in a subgraph and 'dst' is a regular node in the output + // graph. Use the corresponding call output instead. + return src_subgraph.GetResultIndexForEdge(edge); } else { // The source of the edge is in the output graph so use the regular edge // slot. @@ -1696,23 +878,13 @@ int Encapsulator::FindOutputSlotOfEdgeSrc( } Status Encapsulator::FindOutputImageOfEdgeDst( - const string& src_func_id, const string& src_outside_compilation_id, - const string& dst_func_id, const string& dst_outside_compilation_id, + const string& src_func_id, const string& dst_func_id, const std::unordered_map& node_images, const Node* original_dst_node, Node** dst_image) { - if (IsInSubgraph(dst_func_id, dst_outside_compilation_id)) { - if (src_func_id == dst_func_id) { - // The edge is to a subgraph from an outside_compilation cluster in the - // same subgraph so use the appropriate _SendFromHost node in the output - // graph. - TF_RET_CHECK(!src_outside_compilation_id.empty()); - *dst_image = subgraphs_.at(dst_func_id) - .GetSendFromHostNode(src_outside_compilation_id); - } else { - // The edge is to a subgraph from a regular node in the output graph so - // use the subgraph's call node input. - *dst_image = subgraphs_.at(dst_func_id).GetCallNode(); - } + if (IsInSubgraph(dst_func_id)) { + // The edge is to a subgraph from a regular node in the output graph so + // use the subgraph's call node input. + *dst_image = subgraphs_.at(dst_func_id).GetCallNode(); } else { // The destination of the edge is in the output graph so use the node image // in the output graph. @@ -1721,21 +893,14 @@ Status Encapsulator::FindOutputImageOfEdgeDst( return Status::OK(); } -int Encapsulator::FindOutputSlotOfEdgeDst( - const string& src_func_id, const string& src_outside_compilation_id, - const string& dst_func_id, const string& dst_outside_compilation_id, - const Edge* edge) { - if (IsInSubgraph(dst_func_id, dst_outside_compilation_id)) { +int Encapsulator::FindOutputSlotOfEdgeDst(const string& src_func_id, + const string& dst_func_id, + const Edge* edge) { + if (IsInSubgraph(dst_func_id)) { const Subgraph& dst_subgraph = subgraphs_.at(dst_func_id); - if (dst_func_id == src_func_id) { - // 'dst' is in a subgraph and 'src' is outside_compilation in the same - // subgraph. Use the corresponding _SendFromHost input instead. - return dst_subgraph.GetSendFromHostSlot(src_outside_compilation_id, edge); - } else { // 'dst' is in a subgraph and 'src' is a regular node in the output // graph. Use the corresponding call input instead. return dst_subgraph.GetArgIndexForEdge(edge); - } } else { // The destination of the edge is in the output graph so use the regular // edge slot. 
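With the outside_compilation branches removed, an edge endpoint that lies inside a compiled subgraph always resolves to the subgraph's call node, using the argument or result index recorded when the edges were first scanned (RecordArg/GetArgIndexForEdge and RecordResult/GetResultIndexForEdge above). The standalone C++ sketch below illustrates only the deduplication idea behind that bookkeeping; the types and names are toy stand-ins, not the pass's actual API:

// Illustrative sketch only: toy stand-ins for TensorFlow's Node/Edge types.
#include <cstdio>
#include <map>
#include <string>
#include <utility>

// A producer tensor is identified by (node name, output slot).
using TensorId = std::pair<std::string, int>;

class ToySubgraph {
 public:
  // Returns the call-node input slot for an incoming edge, assigning a new
  // dense argument index the first time a given source tensor is seen.
  int ArgIndexFor(const TensorId& src) {
    auto it = args_by_src_.find(src);
    if (it == args_by_src_.end()) {
      it = args_by_src_.emplace(src, static_cast<int>(args_by_src_.size()))
               .first;
    }
    return it->second;
  }

 private:
  std::map<TensorId, int> args_by_src_;  // source tensor -> call input slot
};

int main() {
  ToySubgraph cluster;
  std::printf("%d\n", cluster.ArgIndexFor({"A", 0}));  // 0
  std::printf("%d\n", cluster.ArgIndexFor({"B", 1}));  // 1
  std::printf("%d\n", cluster.ArgIndexFor({"A", 0}));  // 0, deduplicated
}

Two edges fed by the same producer tensor therefore share one call-node input slot, which is why the slot lookup above can reduce to a plain map access once the host I/O special cases are gone.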
@@ -1744,20 +909,16 @@ int Encapsulator::FindOutputSlotOfEdgeDst( } Status Encapsulator::CopyEdgeToOutputGraph( - const Edge* edge, const string& src_func_id, - const string& src_outside_compilation_id, const string& dst_func_id, - const string& dst_outside_compilation_id, + const Edge* edge, const string& src_func_id, const string& dst_func_id, const std::unordered_map& node_images, Graph* graph_out, std::unordered_set, OutputInputTensorPairHasher>* edges_added) { Node* src_image; TF_RETURN_IF_ERROR(FindOutputImageOfEdgeSrc( - src_func_id, src_outside_compilation_id, dst_func_id, - dst_outside_compilation_id, node_images, edge->src(), &src_image)); + src_func_id, dst_func_id, node_images, edge->src(), &src_image)); Node* dst_image; TF_RETURN_IF_ERROR(FindOutputImageOfEdgeDst( - src_func_id, src_outside_compilation_id, dst_func_id, - dst_outside_compilation_id, node_images, edge->dst(), &dst_image)); + src_func_id, dst_func_id, node_images, edge->dst(), &dst_image)); // If this is a control edge then copy it and return. Lift control edges onto // the enclosing call operator. @@ -1774,13 +935,9 @@ Status Encapsulator::CopyEdgeToOutputGraph( return Status::OK(); } - int src_output = - FindOutputSlotOfEdgeSrc(src_func_id, src_outside_compilation_id, - dst_func_id, dst_outside_compilation_id, edge); + int src_output = FindOutputSlotOfEdgeSrc(src_func_id, dst_func_id, edge); - int dst_input = - FindOutputSlotOfEdgeDst(src_func_id, src_outside_compilation_id, - dst_func_id, dst_outside_compilation_id, edge); + int dst_input = FindOutputSlotOfEdgeDst(src_func_id, dst_func_id, edge); // Add the edge, if we have not already added it. if (edges_added @@ -1792,18 +949,6 @@ Status Encapsulator::CopyEdgeToOutputGraph( return Status::OK(); } -Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) { - for (const auto& ancestors : subgraph_ancestors_) { - const string& subgraph = ancestors.first; - for (const string& ancestor : ancestors.second) { - graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNode(), - subgraphs_[subgraph].GetCallNode(), - /* allow_duplicates= */ true); - } - } - return Status::OK(); -} - Status Encapsulator::AddEdgesToOutputGraph( const std::unordered_map& node_images, Graph* graph_out) { @@ -1816,18 +961,13 @@ Status Encapsulator::AddEdgesToOutputGraph( for (const Edge* edge : graph_in_->edges()) { string src_func_id; - string src_outside_compilation_id; - TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id, - &src_outside_compilation_id)); + TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id)); string dst_func_id; - string dst_outside_compilation_id; - TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id, - &dst_outside_compilation_id)); + TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id)); // Ignore edges that are strictly contained within one subgraph, unless // we are constructing parallel check graphs. - if (IsInSubgraph(src_func_id, src_outside_compilation_id) && - IsInSubgraph(dst_func_id, dst_outside_compilation_id) && + if (IsInSubgraph(src_func_id) && IsInSubgraph(dst_func_id) && src_func_id == dst_func_id) { continue; } @@ -1835,15 +975,13 @@ Status Encapsulator::AddEdgesToOutputGraph( // We have an edge that crosses a cluster boundary or is entirely within the // unclustered graph. 
TF_RETURN_IF_ERROR(CopyEdgeToOutputGraph( - edge, src_func_id, src_outside_compilation_id, dst_func_id, - dst_outside_compilation_id, node_images, graph_out, &edges_added)); + edge, src_func_id, dst_func_id, node_images, graph_out, &edges_added)); } for (auto& subgraph_entry : subgraphs_) { Subgraph& subgraph = subgraph_entry.second; subgraph.ConnectSequencerToCallNode(graph_out); } - TF_RETURN_IF_ERROR(AddCallNodeDependencies(graph_out)); return Status::OK(); } @@ -1893,413 +1031,8 @@ Node* AddDummyShapedNode(const Node* src_node, int src_port, return node; } -// Adds a copy of node_in to graph_out and adds the mapping to -// copied_node_images. -Status CopyShapeInferenceNodeToGraph( - Node* node_in, const Node* send_node, - const std::unordered_map& dummy_node_images, - FunctionLibraryDefinition* library, - std::unordered_map* copied_node_images, Graph* graph_out) { - // Once all the ancestor nodes have been added to graph_out, add this node - // and connect it to its ancestors. - Node* node_out = graph_out->CopyNode(node_in); - (*copied_node_images)[node_in] = node_out; - // Don't bother to build the shape inference graph if there's a node with no - // shape inference function, since it would just result in an error later at - // compile time. - const OpRegistrationData* op_reg_data; - TF_RETURN_IF_ERROR(library->LookUp(node_in->type_string(), &op_reg_data)); - if (op_reg_data->shape_inference_fn == nullptr) { - return errors::InvalidArgument( - "Shape inference is not possible for outside_compilation " - "SendFromHost node ", - send_node->name(), " because it depends on node ", node_in->name(), - " which does not have a shape inference function registered."); - } - // Add all the edges to the newly copied node. - for (const Edge* in_edge : node_in->in_edges()) { - if (!in_edge->IsControlEdge()) { - Node* src = in_edge->src(); - const auto iter = dummy_node_images.find(src); - if (iter == dummy_node_images.end()) { - // The src is a copied node so use the original output port. - graph_out->AddEdge((*copied_node_images)[in_edge->src()], - in_edge->src_output(), node_out, - in_edge->dst_input()); - } else { - // The src is a dummy node so use output port 0. - graph_out->AddEdge(iter->second, 0, node_out, in_edge->dst_input()); - } - } - } - // Work around the fact that Enter nodes refuse to propagate shape information - // unless they are marked loop invariant. Since we are never going to execute - // this graph, marking them all loop invariant is fine. - if (node_out->type_string() == "Enter") { - node_out->ClearAttr("is_constant"); - node_out->AddAttr("is_constant", true); - } - return Status::OK(); -} - } // namespace -Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend( - const Graph& graph_in, const BackEdgeHelper& back_edge_helper, - const ShapeRefiner& shape_refiner, - const std::unordered_set& recv_at_host_nodes, Node* send_node, - FunctionLibraryDefinition* library, - std::vector* static_shape_out, - std::unique_ptr* graph_out) { - // Get the control flow structure of the input graph so we can build - // well-formed output graphs. - std::vector control_flow_info; - TF_RETURN_IF_ERROR(BuildControlFlowInfo(&graph_in, &control_flow_info)); - - // Maps from nodes in graph_in to nodes in graph_out. - // - // When an edge has fully defined shape the source node in graph_in is - // replaced in graph_out by a dummy constant node. The mapping from nodes - // in graph_in to dummy nodes is stored in dummy_node_images. 
- // - // When a node in graph_in has at least one ancestor that doesn't have fully - // defined shape, it is copied into graph_out. The mapping from nodes in - // graph_in to copied nodes is stored in copied_node_images. - // - // The two types of node are treated differently because, when adding edges to - // graph_out, an output from a dummy node always uses port 0, whereas an - // output from a copied node uses the same port that was used in graph_in. - std::unordered_map dummy_node_images; - std::unordered_map copied_node_images; - - graph_out->reset(new Graph(graph_in.op_registry())); - (*graph_out)->set_versions(graph_in.versions()); - // The final input to the send node is the dynamic key, which we don't include - // in the static shapes. - static_shape_out->resize(send_node->num_inputs() - 1); - - // We don't use the standard ReverseDFS because we want to cut off traversal - // whenever we find an output with fully defined shape. - struct Work { - Node* node; - bool leave; // Are we entering or leaving node? - }; - std::vector stack({{send_node, false}}); - std::vector visited(graph_in.num_node_ids(), false); - while (!stack.empty()) { - Work w = stack.back(); - stack.pop_back(); - Node* n = w.node; - - if (w.leave) { - TF_RETURN_IF_ERROR(CopyShapeInferenceNodeToGraph( - n, send_node, dummy_node_images, library, &copied_node_images, - graph_out->get())); - } else { - if (visited[n->id()]) continue; - visited[n->id()] = true; - - // Arrange to revisit when all done with all inputs. - stack.push_back(Work{n, true}); - - bool has_parent_with_unknown_shape = false; - for (const Edge* in_edge : n->in_edges()) { - if (!in_edge->IsControlEdge()) { - Node* src_node = in_edge->src(); - int src_port = in_edge->src_output(); - shape_inference::InferenceContext* context = - shape_refiner.GetContext(src_node); - shape_inference::ShapeHandle shape = context->output(src_port); - if (context->FullyDefined(shape)) { - // This ancestor has known shape, so instead of adding it to the - // stack, add a dummy node with that shape to graph_out and - // continue. - TensorShapeProto proto; - context->ShapeHandleToProto(shape, &proto); - VLOG(2) << "Node " << src_node->name() - << " has known shape: " << proto.DebugString(); - if (dummy_node_images.find(src_node) == dummy_node_images.end()) { - dummy_node_images[src_node] = - AddDummyShapedNode(src_node, src_port, control_flow_info, - proto, graph_out->get()); - } - // The final input to the send node is the dynamic key, which we - // don't include in the static shapes. - if (n == send_node && - in_edge->dst_input() < static_shape_out->size()) { - (*static_shape_out)[in_edge->dst_input()] = proto; - } - } else { - has_parent_with_unknown_shape = true; - if (!visited[src_node->id()]) { - if (VLOG_IS_ON(2)) { - TensorShapeProto proto; - context->ShapeHandleToProto(shape, &proto); - VLOG(2) << "Node " << src_node->name() - << " has unknown shape: " << proto.DebugString(); - } - stack.push_back({src_node, false}); - } - } - } - } - if (!has_parent_with_unknown_shape) { - if (n == send_node) { - // The shapes of all the inputs to send_node are statically known. We - // won't have to do any inference at compile time so return now: the - // shapes were stored in static_shape_out above. - graph_out->reset(); - return Status::OK(); - } else { - // Any shape that is being processed is either the original send node - // or has at least one output with statically-unknown shape. 
If the - // latter and it doesn't have any inputs with statically-unknown - // shape, then check that it is of the recv nodes that we can fill in - // the shape of at run-time later. If it isn't one of those, then we - // won't have any additional knowledge at compile time, so we already - // know we won't be able to do shape inference and we can return an - // error now. - if (recv_at_host_nodes.find(n->name()) == recv_at_host_nodes.end()) { - return errors::InvalidArgument( - "Shape inference is not possible for outside_compilation " - "SendFromHost node ", - send_node->name(), " because shape of node ", - FormatNodeForError(*n), - " will not be known at compilation time."); - } - } - } - } - } - - for (const auto edge : back_edge_helper.RemovedEdges()) { - if (copied_node_images.find(edge.dst) != copied_node_images.end()) { - // The destination of this back edge was added to the inference graph, so - // fix it up. - Node* dst = copied_node_images[edge.dst]; - if (dst->type_string() != "Merge") { - return errors::InvalidArgument( - "outside_compilation cluster contains a back-edge to node ", - dst->name(), " of type ", dst->type_string(), - ". The analysis pass only supports back-edges to Merge nodes."); - } - const Edge* existing_input_edge; - if (edge.dst_input != 1 || dst->num_inputs() != 2 || - !dst->input_edge(0, &existing_input_edge).ok()) { - // TODO(misard) if we see graphs built with a different structure, relax - // this constraint. Leaving it here for now to avoid writing unnecessary - // complex code since we believe graphs generated by front ends all have - // the back edge as the second input to the merge node. - return errors::Internal( - "Internal assumption failed while rewriting an outside_compilation " - "cluster that contains a while loop. Logic assumes back-edge is to " - "port 1 of a 2-input Merge node."); - } - // Connect the existing edge to both inputs of the Merge node so that the - // graph will be well-formed. - (*graph_out) - ->AddEdge(existing_input_edge->src(), - existing_input_edge->src_output(), dst, edge.dst_input); - } - } - - return Status::OK(); -} - -namespace { - -// Helper struct for building cluster dependencies and also debugging cycles in -// the dependencies. While computing dependencies we construct a mapping from -// Node* to PathDetails. -struct PathDetails { - struct SubgraphAndCluster { - string subgraph; - string outside_compilation_cluster; - bool operator==(const SubgraphAndCluster& other) const { - return subgraph == other.subgraph && - outside_compilation_cluster == other.outside_compilation_cluster; - } - }; - - struct SubgraphAndClusterHash { - inline std::size_t operator()(const SubgraphAndCluster& v) const { - return hash()( - absl::StrCat(v.subgraph, v.outside_compilation_cluster)); - } - }; - - typedef std::unordered_set - SubgraphAndClusterSet; - - // Returns the set of (subgraph, oc_cluster) pairs that should be recorded as - // ancestors for any successor of this node. If the node is in the outer - // graph, it returns the transitive union of the ancestors of the node's - // inputs. If the node is in an outside_compilation cluster, it returns just - // that cluster. If the node is compiled, it returns the empty set. 
- SubgraphAndClusterSet AncestorsForSuccessor() { - if (subgraph.empty()) { - return ancestor_clusters; - } else if (outside_compilation_cluster.empty()) { - return SubgraphAndClusterSet(); - } else { - SubgraphAndCluster entry; - entry.subgraph = subgraph; - entry.outside_compilation_cluster = outside_compilation_cluster; - return SubgraphAndClusterSet({entry}); - } - } - - // The transitive union of the ancestor's of this node's inputs. This is only - // saved for debugging in order to print out enough information to debug a - // discovered cycle. - SubgraphAndClusterSet ancestor_clusters; - // The subgraph attr on this node. - string subgraph; - // The outside_compilation attr on this node. - string outside_compilation_cluster; -}; - -// Adds an edge from ancestor to successor to the cycle detector, and returns an -// error if that edge causes the formation of a cycle. In the error case, logs -// the contents of the node_ancestors_map to facilitate debugging. -Status CheckClusterDependencyForCycles( - const string& ancestor, const string& successor, - const std::unordered_map>& ancestors, - const std::unordered_map& node_ancestors_map, - GraphCycles* cycle_detector, - std::unordered_map* cycle_detector_map) { - if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) { - (*cycle_detector_map)[ancestor] = cycle_detector->NewNode(); - } - if (cycle_detector_map->find(successor) == cycle_detector_map->end()) { - (*cycle_detector_map)[successor] = cycle_detector->NewNode(); - } - - if (!cycle_detector->InsertEdge((*cycle_detector_map)[ancestor], - (*cycle_detector_map)[successor])) { - LOG(ERROR) << "Cycle in outside_compilation clusters"; - for (const auto& cluster : ancestors) { - LOG(ERROR) << "Cluster " << cluster.first << " depends on:"; - for (const auto& ancestor : cluster.second) { - LOG(ERROR) << " " << ancestor; - } - } - for (const auto& node_ancestors : node_ancestors_map) { - LOG(ERROR) << "Node " << node_ancestors.first->name() << " (" - << node_ancestors.second.subgraph << ";" - << node_ancestors.second.outside_compilation_cluster - << ") has ancestor clusters:"; - for (const auto& ancestor : node_ancestors.second.ancestor_clusters) { - LOG(ERROR) << " " << ancestor.subgraph << ";" - << ancestor.outside_compilation_cluster; - } - } - return errors::InvalidArgument( - "Can't compile outside_compilation clusters because there is a " - "dependency cycle: see error log for details."); - } - return Status::OK(); -} - -} // namespace - -Status Encapsulator::FindClusterDependencies() { - // Map from nodes to ancestor details. A node is entered into the map if it is - // in a compilation subgraph, and outside_compilation cluster, or appears on a - // path in the outer graph leading from an outside_compilation subgraph. - std::unordered_map node_ancestors_map; - // We check that clusters are acyclic using this cycle detector. - GraphCycles cycle_detector; - // Map from cluster name to cycle detector node id. - std::unordered_map cycle_detector_map; - // Process the nodes in topologically-sorted order. - std::vector nodes; - GetReversePostOrder(*graph_in_, &nodes); - for (Node* node : nodes) { - string subgraph_name; - string oc_cluster; - TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &subgraph_name, &oc_cluster)); - // First create an entry in the ancestors map if the node is in a compiled - // subgraph or outside_compilation cluster, or if any incoming edge is from - // a node with an ancestor map entry; and find the union of all the - // ancestors. 
- if (!subgraph_name.empty()) { - node_ancestors_map[node].subgraph = subgraph_name; - node_ancestors_map[node].outside_compilation_cluster = oc_cluster; - } - for (Node* src : node->in_nodes()) { - const auto iter = node_ancestors_map.find(src); - if (iter != node_ancestors_map.end()) { - const auto& ancestors_to_follow = iter->second.AncestorsForSuccessor(); - for (const auto& ancestor : ancestors_to_follow) { - if (ancestor.subgraph != subgraph_name || - ancestor.outside_compilation_cluster != oc_cluster) { - node_ancestors_map[node].ancestor_clusters.insert(ancestor); - } - } - } - } - if (!subgraph_name.empty()) { - // The node is in a compiled subgraph or an outside_compilation cluster. - if (oc_cluster.empty()) { - // The node is not in an outside_compilation cluster. Record the - // subgraph's ancestor dependencies. - for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) { - if (cluster.subgraph != subgraph_name) { - subgraph_ancestors_[subgraph_name].insert(cluster.subgraph); - TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( - cluster.subgraph, subgraph_name, subgraph_ancestors_, - node_ancestors_map, &cycle_detector, &cycle_detector_map)); - } - } - } else { - Subgraph& subgraph = subgraphs_[subgraph_name]; - // The node is in an outside_compilation cluster. Record the cluster - // and/or subgraph ancestor dependencies. - for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) { - if (cluster.subgraph == subgraph_name) { - // The ancestor is in the same subgraph. - if (cluster.outside_compilation_cluster != oc_cluster) { - // But not in the same oc_cluster, so record the dependency. - subgraph.RecordOutsideCompilationDependency( - oc_cluster, cluster.outside_compilation_cluster); - TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( - cluster.outside_compilation_cluster, oc_cluster, - subgraph.OutsideCompilationAncestorMap(), node_ancestors_map, - &cycle_detector, &cycle_detector_map)); - } - } else { - // The ancestor is in a different subgraph, so record the - // dependency. - subgraph_ancestors_[subgraph_name].insert(cluster.subgraph); - TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( - cluster.subgraph, subgraph_name, subgraph_ancestors_, - node_ancestors_map, &cycle_detector, &cycle_detector_map)); - } - } - } - } - } - if (VLOG_IS_ON(2)) { - // Print debug information. - VLOG(2) << "node_ancestors_map:"; - for (const auto& node_iter : node_ancestors_map) { - VLOG(2) << "\t" << node_iter.first->name() << ": subgraph = '" - << node_iter.second.subgraph - << "', outside_compilation_cluster = '" - << node_iter.second.outside_compilation_cluster - << "', ancestor_clusters: " - << (node_iter.second.ancestor_clusters.empty() ? 
"(empty)" : ""); - for (const auto& cluster_iter : node_iter.second.ancestor_clusters) { - VLOG(2) << "\t\tsubgraph = '" << cluster_iter.subgraph - << "', outside_compilation_cluster = '" - << cluster_iter.outside_compilation_cluster << "'"; - } - } - } - return Status::OK(); -} - Status Encapsulator::MakePrunedGraphCopyAndInline( const Graph& graph, const std::vector& sink_nodes, std::unique_ptr* pruned_graph, @@ -2362,118 +1095,6 @@ Status Encapsulator::MakePrunedGraphCopyAndInline( return Status::OK(); } -Status Encapsulator::MakeGraphForOutsideCompilationSends( - const Graph& graph, std::unique_ptr* pruned_graph, - BackEdgeHelper* back_edge_helper, ShapeRefiner* shape_refiner, - std::unordered_map* node_images, - FunctionLibraryDefinition* library) { - // Find all the send_from_host nodes in all subgraphs, to use as roots for the - // pruning. - std::vector send_from_host_nodes; - for (auto& subgraph_entry : subgraphs_) { - Subgraph& subgraph = subgraph_entry.second; - std::vector outside_compilation_names; - subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names); - for (const auto& name : outside_compilation_names) { - Node* send_node = subgraph.GetSendFromHostNode(name); - if (send_node != nullptr) { - send_from_host_nodes.push_back(send_node); - } - } - } - - // Make a copy of all the graph nodes needed to evaluate the send_from_host - // nodes, inlining any functions as needed. - TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline( - graph, send_from_host_nodes, pruned_graph, node_images, library)); - FixupSourceAndSinkEdges(pruned_graph->get()); - - // Remove back edges from any cycles in the pruned graph to simplify shape - // inference traversal. They will be fixed up in the per-subgraph shape - // inference graphs stored in the function library. - TF_RETURN_IF_ERROR(back_edge_helper->Remove(pruned_graph->get())); - - // Perform shape inference on the pruned graph. - shape_refiner->set_require_shape_inference_fns(false); - std::vector post_order; - GetReversePostOrder(*(*pruned_graph), &post_order); - for (auto node : post_order) { - // Ignore the status returned by the shape_refiner. At this point we want - // the best effort shapes, even if no shape function is registered for a - // node. - Status status = shape_refiner->AddNode(node); - if (!status.ok()) { - VLOG(1) << "Shape inference failed for node: " << status; - } - } - - return Status::OK(); -} - -Status Encapsulator::GetShapeInfoForOutsideCompilationSends( - Graph* graph_out, FunctionLibraryDefinition* library) { - BackEdgeHelper back_edge_helper; - std::unique_ptr pruned_graph; - ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry()); - std::unordered_map node_images; - TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends( - *graph_out, &pruned_graph, &back_edge_helper, &shape_refiner, - &node_images, library)); - - if (VLOG_IS_ON(1)) { - DumpGraphToFile("pruned_graph_for_shape_inference", *pruned_graph, library); - } - - for (auto& subgraph_entry : subgraphs_) { - const string& subgraph_name = subgraph_entry.first; - Subgraph& subgraph = subgraph_entry.second; - // Find all the recv_at_host nodes in this subgraph. 
- std::vector outside_compilation_names; - subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names); - std::unordered_set recv_at_host_names; - for (const auto& oc_name : outside_compilation_names) { - Node* recv_node = subgraph.GetRecvAtHostNode(oc_name); - if (recv_node != nullptr) { - recv_at_host_names.insert(recv_node->name()); - } - } - // For each send_from_host node, do as much shape inference as possible - // without knowing the shape of the recv_at_host nodes, and store the - // result, along with enough information to complete the job at compile time - // once the recv_at_host shapes are known. - for (const auto& oc_name : outside_compilation_names) { - Node* send_node = subgraph.GetSendFromHostNode(oc_name); - std::vector static_shape; - std::unique_ptr graph; - if (send_node != nullptr) { - TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend( - *pruned_graph, back_edge_helper, shape_refiner, recv_at_host_names, - node_images[send_node], library, &static_shape, &graph)); - if (graph == nullptr) { - VLOG(2) << "Send node " << send_node->name() << " shapes"; - for (int i = 0; i < static_shape.size(); ++i) { - VLOG(2) << static_shape[i].DebugString(); - } - } else { - if (VLOG_IS_ON(2)) { - GraphDef graphdef; - graph->ToGraphDef(&graphdef); - VLOG(2) << "Send node " << send_node->name() << " graph\n" - << graphdef.DebugString(); - } - } - } - TF_RETURN_IF_ERROR(subgraph.AddShapeInferenceInfo( - subgraph_name, oc_name, static_shape, graph.get(), library)); - } - if (!outside_compilation_names.empty()) { - TF_RETURN_IF_ERROR(subgraph.ReplaceFunctionDef(library)); - } - } - - return Status::OK(); -} - Status Encapsulator::BuildOutputGraph(Graph* graph_out, FunctionLibraryDefinition* library) { // Map from nodes in the input graph to nodes in the output graph. 
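The shape-inference plumbing deleted above was layered on a primitive that survives this change: MakePrunedGraphCopyAndInline, which copies only the nodes that can reach a given set of sink nodes. Below is a minimal standalone sketch of that reverse-reachability pruning, written against a toy adjacency list rather than TensorFlow's Graph type; the names are illustrative only:

// Illustrative sketch only: prune a toy graph to the ancestors of its sinks.
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

// edges: producer -> consumers. Returns every node that can reach a sink,
// including the sinks themselves.
std::set<std::string> AncestorsOfSinks(
    const std::map<std::string, std::vector<std::string>>& edges,
    const std::vector<std::string>& sinks) {
  // Build reverse edges: consumer -> producers.
  std::map<std::string, std::vector<std::string>> reverse_edges;
  for (const auto& entry : edges) {
    for (const std::string& dst : entry.second) {
      reverse_edges[dst].push_back(entry.first);
    }
  }
  std::set<std::string> keep(sinks.begin(), sinks.end());
  std::vector<std::string> stack(sinks.begin(), sinks.end());
  while (!stack.empty()) {
    const std::string node = stack.back();
    stack.pop_back();
    for (const std::string& producer : reverse_edges[node]) {
      if (keep.insert(producer).second) stack.push_back(producer);
    }
  }
  return keep;
}

int main() {
  // A -> B -> send, and C -> D, which never reaches the sink.
  const std::map<std::string, std::vector<std::string>> edges = {
      {"A", {"B"}}, {"B", {"send"}}, {"C", {"D"}}};
  for (const std::string& node : AncestorsOfSinks(edges, {"send"})) {
    std::printf("%s\n", node.c_str());  // prints A, B, send
  }
}

Only the outside_compilation callers of that pruning step are removed here; the helper itself is left in place.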
@@ -2481,26 +1102,19 @@ Status Encapsulator::BuildOutputGraph(Graph* graph_out, TF_RETURN_IF_ERROR(CopyNodesToOutputGraph(graph_out, &node_images)); TF_RETURN_IF_ERROR(AddFunctionCallNodes(node_images, graph_out)); - TF_RETURN_IF_ERROR(AddOutsideCompilationHostIONodes(node_images, graph_out)); TF_RETURN_IF_ERROR(AddEdgesToOutputGraph(node_images, graph_out)); - TF_RETURN_IF_ERROR( - GetShapeInfoForOutsideCompilationSends(graph_out, library)); - return Status::OK(); } } // anonymous namespace Status EncapsulateSubgraphsInFunctions( - string group_attribute, string outside_compilation_attribute, - const Graph& graph_in, const RewriteSubgraphFn& rewrite_subgraph_fn, - bool reuse_existing_functions, std::unique_ptr* graph_out, - FunctionLibraryDefinition* library) { + string group_attribute, const Graph& graph_in, + const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, + std::unique_ptr* graph_out, FunctionLibraryDefinition* library) { Encapsulator encapsulator(std::move(group_attribute), - std::move(outside_compilation_attribute), &graph_in); - TF_RETURN_IF_ERROR(encapsulator.FindClusterDependencies()); TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs(library)); TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs( @@ -2685,9 +1299,8 @@ Status EncapsulateSubgraphsPass::Run( TF_RETURN_WITH_CONTEXT_IF_ERROR( EncapsulateSubgraphsInFunctions( - kXlaClusterAttr, kXlaOutsideCompilationAttr, **options.graph, - rewrite_subgraph, /*reuse_existing_functions=*/false, &graph_out, - library), + kXlaClusterAttr, **options.graph, rewrite_subgraph, + /*reuse_existing_functions=*/false, &graph_out, library), "EncapsulateSubgraphsPass failed"); if (VLOG_IS_ON(1)) { diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h index 90354a801af..62b752cf40f 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h @@ -52,16 +52,6 @@ typedef std::function* graph_out, - FunctionLibraryDefinition* library); + string group_attribute, const Graph& graph_in, + const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions, + std::unique_ptr* graph_out, FunctionLibraryDefinition* library); // The attribute that marks function calls produced by the encapsulate // subgraphs pass and that should in turn be compiled via XlaLaunch operators. 
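With the outside_compilation attribute argument gone, callers of EncapsulateSubgraphsInFunctions pass only the group attribute, the input graph, the optional rewrite function, and the output locations. A call-site fragment consistent with the updated declaration (graph and lib_def are assumed to be set up as in the test fixture in the next file):

std::unique_ptr<Graph> graph_out;
Status s = EncapsulateSubgraphsInFunctions(
    "_encapsulate", *graph,
    /*rewrite_subgraph_fn=*/{},
    /*reuse_existing_functions=*/false, &graph_out, lib_def.get());
if (!s.ok()) return s;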
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 958b0a5f61c..d162c16cc16 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -514,10 +514,10 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library, auto flr = pflr->GetFLR("/job:localhost/replica:0/task:0/cpu:0"); std::unique_ptr graph_out; - s = EncapsulateSubgraphsInFunctions( - "_encapsulate", /*outside_compilation_attribute=*/"", *graph, - /*rewrite_subgraph_fn=*/{}, - /*reuse_existing_functions=*/false, &graph_out, lib_def.get()); + s = EncapsulateSubgraphsInFunctions("_encapsulate", *graph, + /*rewrite_subgraph_fn=*/{}, + /*reuse_existing_functions=*/false, + &graph_out, lib_def.get()); if (!s.ok()) return s; std::unordered_map clusters; @@ -746,7 +746,7 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) { FunctionLibraryDefinition library(OpRegistry::Global(), {}); std::unique_ptr graph; TF_ASSERT_OK(EncapsulateSubgraphsInFunctions( - "_cluster", "", graph_before_encapsulation, + "_cluster", graph_before_encapsulation, /*rewrite_subgraph_fn=*/{}, /*reuse_existing_functions=*/false, &graph, &library)); @@ -798,7 +798,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) { FunctionLibraryDefinition library(OpRegistry::Global(), {}); int guaranteed_consts = 0; TF_ASSERT_OK(EncapsulateSubgraphsInFunctions( - "_encapsulate", "", graph_before, + "_encapsulate", graph_before, /*rewrite_subgraph_fn=*/ [&guaranteed_consts](const std::vector& arg_source_tensors, std::unique_ptr* graph_ptr, @@ -843,7 +843,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) { FunctionLibraryDefinition library(OpRegistry::Global(), {}); int guaranteed_consts = 0; TF_ASSERT_OK(EncapsulateSubgraphsInFunctions( - "_encapsulate", "", graph_before, + "_encapsulate", graph_before, /*rewrite_subgraph_fn=*/ [&guaranteed_consts](const std::vector& arg_source_tensors, std::unique_ptr* graph_ptr, @@ -1109,7 +1109,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { absl::Span( {"_xla_token_arg_node", "outside_compilation_O1_host_compute"})}}, - {"F"}}, + {"F", "outside_compilation_O1_host_compute"}}, {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"C:o:0", "D:o:0"}, @@ -1990,7 +1990,8 @@ TEST(EncapsulateSubgraphsTest, {"_xla_token_input_nodes", absl::Span( {"_xla_token_arg_node", - "outside_compilation_O1_host_compute"})}}}, + "outside_compilation_O1_host_compute"})}}, + {"outside_compilation_O1_host_compute"}}, }, {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"}, {"h_0_retval_retval", "H:o:0"}}); @@ -2117,7 +2118,8 @@ TEST(EncapsulateSubgraphsTest, {"_xla_token_input_nodes", absl::Span( {"_xla_token_arg_node", - "outside_compilation_O1_host_compute"})}}}, + "outside_compilation_O1_host_compute"})}}, + {"outside_compilation_O1_host_compute"}}, {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"D:o:0"}, @@ -2267,7 +2269,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { {"_xla_token_input_nodes", absl::Span( {"_xla_token_arg_node", "outside_compilation_O1_host_compute"})}}, - {}}, + {"outside_compilation_O1_host_compute"}}, {{"outside_compilation_O3_host_compute"}, "XlaHostCompute", {"D:o:0"}, @@ -2282,7 +2284,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { absl::Span({"_xla_token_arg_node", "outside_compilation_O1_host_compute", 
"outside_compilation_O2_host_compute"})}}, - {}}}, + {"outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"}}}, {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"}, {"h_0_retval_retval", "H:o:0"}}); diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc index 4e65971191a..2c2cd094133 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc @@ -231,9 +231,9 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, auto output = absl::make_unique((*graph)->op_registry()); TF_RETURN_WITH_CONTEXT_IF_ERROR( - EncapsulateSubgraphsInFunctions( - kXlaClusterAttr, "", **graph, RewriteSubgraph, - /*reuse_existing_functions=*/true, &output, flib_def), + EncapsulateSubgraphsInFunctions(kXlaClusterAttr, **graph, RewriteSubgraph, + /*reuse_existing_functions=*/true, + &output, flib_def), "EncapsulateXlaComputationsPass failed"); graph->swap(output); return Status::OK(); diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index a6e66657fb5..0667de9d230 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -393,7 +393,7 @@ Status ValidateOutsideCompilationCallNode(Node* call_node) { // Replace outside compilation function call node with XlaHostCompute node. // If the function call node has no input/output edges, we will just remove it // and not create a XlaHostCompute node. -Status ReplaceOrRemoveOutsideCompilationCallNode( +xla::StatusOr ReplaceOrRemoveOutsideCompilationCallNode( Graph* g, Node* call_node, const std::map& host_compute_core, const absl::flat_hash_map>& cluster_deps) { // If the function call node has no input/output edges, just remove it. @@ -413,7 +413,7 @@ Status ReplaceOrRemoveOutsideCompilationCallNode( if (!has_edge) { VLOG(4) << "Did not add HostCompute node for " << call_node->DebugString(); g->RemoveNode(call_node); - return Status::OK(); + return nullptr; } // Build XlaHostCompute NodeDef. @@ -424,7 +424,7 @@ Status ReplaceOrRemoveOutsideCompilationCallNode( ReplaceNode(g, call_node, node_def)); VLOG(4) << "Added HostCompute node: " << host_compute_node->DebugString(); - return Status::OK(); + return host_compute_node; } // Resets "device_ordinal" attr to placeholder value for related nodes @@ -1634,7 +1634,7 @@ Status ExtractOutsideCompilationForFunction( RewriteOutsideCompilationSubgraphFn rewrite_fn( xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name); TF_RETURN_IF_ERROR(EncapsulateSubgraphsInFunctions( - outside_compilation_attr_name, "", *fbody->graph, rewrite_fn, + outside_compilation_attr_name, *fbody->graph, rewrite_fn, /*reuse_existing_functions=*/true, &graph_out, fld)); // Replace outside_compilation function nodes with HostCompute ops. 
@@ -1670,10 +1670,35 @@ Status ExtractOutsideCompilationForFunction( } } } + std::map host_compute_nodes; for (Node* n : outside_compilation_nodes) { TF_RETURN_IF_ERROR(ValidateOutsideCompilationCallNode(n)); - TF_RETURN_IF_ERROR(ReplaceOrRemoveOutsideCompilationCallNode( - graph_out.get(), n, host_compute_core, *cluster_deps)); + auto host_compute_node_or = ReplaceOrRemoveOutsideCompilationCallNode( + graph_out.get(), n, host_compute_core, *cluster_deps); + TF_RETURN_IF_ERROR(host_compute_node_or.status()); + Node* host_compute_node = host_compute_node_or.ValueOrDie(); + if (host_compute_node) { + host_compute_nodes[host_compute_node->name()] = host_compute_node; + } + } + // For XlaHostCompute nodes with dependencies, add control edges between them + // so XlaCompiler can handle them in correct order. + for (auto iter : host_compute_nodes) { + Node* host_compute_node = iter.second; + std::vector token_input_node_names; + TF_RETURN_IF_ERROR(GetNodeAttr(host_compute_node->def(), + kXlaTokenInputNodesAttrName, + &token_input_node_names)); + for (const string& node_name : token_input_node_names) { + if (node_name == kXlaTokenArgNodeName) { + continue; + } + + auto iter = host_compute_nodes.find(node_name); + if (iter != host_compute_nodes.end()) { + graph_out->AddControlEdge(iter->second, host_compute_node); + } + } } // Handle nodes with associated functions. diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc index 93817378e96..2717487c78e 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc @@ -990,6 +990,16 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), "_xla_token_input_nodes", &token_input_nodes)); EXPECT_EQ(token_input_nodes, expected_token_input_nodes_1); + + // Check there is a control edge from host_compute_0 to host_compute_1. + bool has_control_edge = false; + for (const Edge *e : host_compute_1->in_edges()) { + if (e->IsControlEdge() && e->src() == host_compute_0) { + has_control_edge = true; + break; + } + } + EXPECT_TRUE(has_control_edge); } TEST_F(ExtractOutsideCompilationForFunctionTest, @@ -1062,5 +1072,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()), "_xla_token_input_nodes", &token_input_nodes)); EXPECT_EQ(token_input_nodes, expected_token_input_nodes_1); + + // Check there is a control edge from host_compute_0 to host_compute_1. 
+ bool has_control_edge = false; + for (const Edge *e : host_compute_1->in_edges()) { + if (e->IsControlEdge() && e->src() == host_compute_0) { + has_control_edge = true; + break; + } + } + EXPECT_TRUE(has_control_edge); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/graphcycles/BUILD b/tensorflow/compiler/jit/graphcycles/BUILD index f9be7c45743..69c67c87615 100644 --- a/tensorflow/compiler/jit/graphcycles/BUILD +++ b/tensorflow/compiler/jit/graphcycles/BUILD @@ -1,9 +1,8 @@ -licenses(["notice"]) # Apache 2.0 - package( default_visibility = [ "//tensorflow/compiler/tf2xla:internal", ], + licenses = ["notice"], # Apache 2.0 ) load("//tensorflow:tensorflow.bzl", "tf_cc_test") diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index 3524da23fb3..0a65529cdb9 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -1,9 +1,8 @@ -licenses(["notice"]) # Apache 2.0 - package( default_visibility = [ "//tensorflow/compiler/tf2xla:internal", ], + licenses = ["notice"], # Apache 2.0 ) cc_library( @@ -29,6 +28,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:state_ops_op_lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:tf_allocator_adapter", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", ], diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 6df0991e354..e825a77b1d1 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -61,7 +61,7 @@ XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) { DeviceType device_type = ctx->device_type(); se::Platform::Id platform_id = nullptr; const XlaDevice::Metadata* xla_device_metadata = nullptr; - std::unique_ptr xla_allocator; + std::unique_ptr xla_allocator; se::DeviceMemoryAllocator* device_allocator = nullptr; if (ctx->device_type() == DeviceType(DEVICE_CPU)) { @@ -93,7 +93,7 @@ XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) { se::MultiPlatformManager::PlatformWithId(platform_id); OP_REQUIRES_OK_RETURN(ctx, XlaPlatformInfo(), maybe_platform.status()); - xla_allocator = absl::make_unique( + xla_allocator = absl::make_unique( maybe_platform.ValueOrDie(), ctx->device()->GetAllocator({})); } diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h index eaa686780e4..3a1009ec8a7 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.h +++ b/tensorflow/compiler/jit/kernels/xla_ops.h @@ -27,6 +27,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/util/stream_executor_util.h" +#include "tensorflow/stream_executor/tf_allocator_adapter.h" namespace tensorflow { @@ -36,11 +37,11 @@ class XlaPlatformInfo { public: XlaPlatformInfo() : device_type_("") {} XlaPlatformInfo(XlaPlatformInfo&&) = default; - explicit XlaPlatformInfo(const DeviceType device_type, - se::Platform::Id platform_id, - const XlaDevice::Metadata* xla_device_metadata, - std::unique_ptr xla_allocator, - se::DeviceMemoryAllocator* device_allocator) + explicit XlaPlatformInfo( + const DeviceType device_type, se::Platform::Id platform_id, + const XlaDevice::Metadata* xla_device_metadata, + std::unique_ptr xla_allocator, + se::DeviceMemoryAllocator* device_allocator) : device_type_(device_type), platform_id_(platform_id), xla_device_metadata_(xla_device_metadata), @@ -84,8 +85,8 @@ class XlaPlatformInfo { // then device_allocator_ is the xla::Backend's memory allocator and // xla_allocator_ is null. If the op is placed on a regular CPU or GPU device // then device_allocator_ is null and xla_allocator_ points to an appropriate - // XlaAllocator instance. - std::unique_ptr xla_allocator_; + // se::TfAllocatorAdapter instance. + std::unique_ptr xla_allocator_; se::DeviceMemoryAllocator* device_allocator_; TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo); diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 4142de56813..81ffea31c30 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -229,32 +229,18 @@ class MarkForCompilationPassImpl { // Initialize some internal data structures. Status Initialize(); - // Runs through all the nodes in `cycles_graph_` and tries to create clusters. - // Returns true if any new clusters were created. - StatusOr RunEdgeContractionLoopInPostOrderOnce(); + // Runs through the entire cluster graph in post-order and calls `fn(from, + // to)` on each edge. `fn(from, to)` is expected to return true if it was + // able to contract `from`->`to`. + // + // Returns true if `fn` returned true for any edge. + template + StatusOr ForEachEdgeInPostOrder(FnTy fn); - // Runs through all the nodes in `cycles_graph_` and tries to contract high - // priority edges for clusters. Returns true if any new clusters were created. - // - // There are potentially many maximal clustering results, but they will not - // all be equally performant. Some clustering decision are likely to improve - // performance much more than others, and we cannot order contractions on this - // cost function, nor can we look at global information while deciding on - // individual edges to contract. Instead, we will make decisions on these - // important edges then make decisions on all other edges, causing the highest - // chance of all most important edges to be contracted. - // - // An example of where this might occur is with a digraph: - // {A -> B, B -> C, A -> X, X -> C} where B is a Size operation and X is - // not-compilable. In this case, the valid clusterings are {A,B} or {B,C}. B - // should be clustered with A because it will prevent a potentially large - // tensor from A being computed and copied. - // - // This pass will ensure that contraction happens, which cannot be enforced in - // a single pass with the current algorithm. - // graph and prevent B->C from being clusterd in anticipation of a later A->B - // cluster. 
- StatusOr ContractPreferredEdges(); + // If from->to is a "preferred" edge (i.e. if we have a choice, we want to + // prioritize contracting from->to over contracting other edges) then + // contracts it and returns true. Else returns false. + StatusOr ContractEdgeIfPreferred(Cluster* from, Cluster* to); // Contracts as many edges as possible to create XLA clusters. After this // finishes the clustering decisions made are implicitly stored in @@ -276,10 +262,6 @@ class MarkForCompilationPassImpl { // true if successful. StatusOr TryToContractEdge(Cluster* from, Cluster* to); - // Tries to contract each edge from `cluster_from`. Returns true if any edges - // were contracted, false otherwise. - StatusOr TryToContractEdgesFrom(Cluster* cluster_from); - // Nodes that XLA can compile are put in `compilation_candidates_`. Status FindCompilationCandidates(); @@ -401,6 +383,13 @@ class MarkForCompilationPassImpl { return true; } + string EdgeContractionFailureMsg(Cluster* from, Cluster* to, + absl::string_view reason) { + return absl::StrCat("Could not contract ", from->DebugString(*graph_), + " -> ", to->DebugString(*graph_), " because ", reason, + "."); + } + DebugOptions debug_options_; Graph* graph_; FunctionLibraryDefinition* flib_def_; @@ -611,7 +600,8 @@ Status MarkForCompilationPassImpl::Initialize() { return BuildInitialClusterSet(); } -StatusOr MarkForCompilationPassImpl::ContractPreferredEdges() { +template +StatusOr MarkForCompilationPassImpl::ForEachEdgeInPostOrder(FnTy fn) { bool changed = false; for (int32 node : cycles_graph_.AllNodesInPostOrder()) { Cluster* cluster_from = GetClusterForCyclesGraphNode(node); @@ -632,55 +622,33 @@ StatusOr MarkForCompilationPassImpl::ContractPreferredEdges() { continue; } - if (cluster_to->cluster_size() == 1) { - Node* n = graph_->FindNodeId(cluster_to->GetIdOfOnlyNode()); - - // Shape consuming operations are desirable to cluster with their - // operands because they return a small set of scalar values after - // consuming a large amount of data. For example, given a graph X -> Y - // -> Size -> Z, where the possible clustering is [{X, Y, Size}, {Z}] or - // [{X, Y}, {Size, Z}], the better clustering is Size with Y because the - // output of size will be a small tensor while Y is a potentially large - // tensor that must be computed and possible transposed/copied before - // the second cluster executes. - if (IsShapeConsumerOp(*n)) { - TF_ASSIGN_OR_RETURN(bool contracted_edge, - TryToContractEdge(cluster_from, cluster_to)); - changed |= contracted_edge; - } - } + TF_ASSIGN_OR_RETURN(bool contracted_edge, fn(cluster_from, cluster_to)); + changed |= contracted_edge; } } return changed; } -StatusOr -MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() { - bool changed = false; - // Iterating over the graph once in post-order is sufficient to produce a - // maximal clustering: - // - // A. We visit a cluster only after maximally clustering all its children. - // B. By the time we're done with `node` (in `TryToContractEdgesFrom`) all of - // its children that could have been absorbed into `node` have been - // absorbed. - // C. We have an invariant that making a cluster larger does not make edges - // leaving it more contractable. That is, if we have - // digraph { X->Y; Y->Z; } then collapsing X->Y does not make it possible - // to contract Y->Z if Y->Z was not contractible originally. 
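For context on the refactor in this file: the two specialized loops, `ContractPreferredEdges` and `RunEdgeContractionLoopInPostOrderOnce`, are folded into one `ForEachEdgeInPostOrder` template that walks each (from, to) cluster edge in post order and defers the contraction decision to a callback. The sketch below shows that shape with a toy graph and plain `bool` results instead of `StatusOr<bool>`; it is an illustration of the pattern, not the real cluster-graph code:

```cpp
// Toy sketch of the ForEachEdgeInPostOrder pattern: one generic post-order
// edge walk, with the per-edge policy (preferred edges first, everything else
// second) supplied as a callback that reports whether it contracted the edge.
#include <iostream>
#include <map>
#include <set>
#include <vector>

struct ToyClusterGraph {
  std::map<int, std::set<int>> succ;  // cluster id -> successor cluster ids
  std::vector<int> post_order;        // assumed precomputed, in post order

  // Merge `to` into `from`: inherit its successors and drop it from the graph.
  bool ContractEdge(int from, int to) {
    succ[from].erase(to);
    for (int s : succ[to]) {
      if (s != from) succ[from].insert(s);  // avoid creating a self edge
    }
    succ.erase(to);
    return true;
  }

  // Calls fn(from, to) on every surviving edge, visiting nodes in post order.
  // Returns true if fn contracted anything, mirroring the template in the diff.
  template <typename FnTy>
  bool ForEachEdgeInPostOrder(FnTy fn) {
    bool changed = false;
    for (int from : post_order) {
      if (!succ.count(from)) continue;  // already merged into another cluster
      // Copy the successor set: fn may mutate it while we iterate.
      const std::vector<int> successors(succ[from].begin(), succ[from].end());
      for (int to : successors) {
        if (!succ.count(to)) continue;  // merged away by an earlier callback
        changed |= fn(from, to);
      }
    }
    return changed;
  }
};

int main() {
  ToyClusterGraph g;
  g.succ = {{2, {1}}, {1, {0}}, {0, {}}};
  g.post_order = {0, 1, 2};
  // Policy callback: contract every edge (stand-in for TryToContractEdge).
  const bool changed = g.ForEachEdgeInPostOrder(
      [&](int from, int to) { return g.ContractEdge(from, to); });
  std::cout << "changed=" << changed << " clusters=" << g.succ.size() << "\n";
  return 0;
}
```

In the hunks that follow, `RunEdgeContractionLoop` simply calls this walker with the preferred-edge policy, then with `TryToContractEdge`, and then once more to check that no further contraction is possible.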
- for (int32 node : cycles_graph_.AllNodesInPostOrder()) { - Cluster* cluster_from = GetClusterForCyclesGraphNode(node); - if (!cluster_from) { - continue; - } +StatusOr MarkForCompilationPassImpl::ContractEdgeIfPreferred( + Cluster* from, Cluster* to) { + if (to->cluster_size() == 1) { + Node* n = graph_->FindNodeId(to->GetIdOfOnlyNode()); - TF_ASSIGN_OR_RETURN(bool contracted_one_edge, - TryToContractEdgesFrom(cluster_from)); - changed |= contracted_one_edge; + // Shape consuming operations are desirable to cluster with their + // operands because they return a small set of scalar values after + // consuming a large amount of data. For example, given a graph X -> Y + // -> Size -> Z, where the possible clustering is [{X, Y, Size}, {Z}] or + // [{X, Y}, {Size, Z}], the better clustering is Size with Y because the + // output of size will be a small tensor while Y is a potentially large + // tensor that must be computed and possibly transposed/copied before + // the second cluster executes. + if (IsShapeConsumerOp(*n)) { + return TryToContractEdge(from, to); + } } - return changed; + return false; } Status MarkForCompilationPassImpl::RunEdgeContractionLoop() { @@ -694,25 +662,68 @@ Status MarkForCompilationPassImpl::RunEdgeContractionLoop() { // without restrictions. This helps to minimize data output from clusters (and // possible transpose operations before outputs) that might occur if a // ShapeConsumingOp is on the edge of 2 clusters due to cycle considerations. - TF_ASSIGN_OR_RETURN(bool changed, ContractPreferredEdges()); + // + // There are potentially many maximal clustering results, but they will not + // all be equally performant. Some clustering decisions are likely to improve + // performance much more than others, and we cannot order contractions on this + // cost function, nor can we look at global information while deciding on + // individual edges to contract. Instead, we will make decisions on these + // important edges first and then make decisions on all other edges, giving the + // most important edges the highest chance of being contracted. + // + // An example of where this might occur is with a digraph: + // {A -> B, B -> C, A -> X, X -> C} where B is a Size operation and X is + // not-compilable. In this case, the valid clusterings are {A,B} or {B,C}. B + // should be clustered with A because it will prevent a potentially large + // tensor from A being computed and copied. + // + // This pass will ensure that contraction happens, which cannot be enforced in + // a single pass with the current algorithm: a single pass cannot look ahead in the + // graph and prevent B->C from being clustered in anticipation of a later A->B + // cluster. - TF_ASSIGN_OR_RETURN(changed, RunEdgeContractionLoopInPostOrderOnce()); + TF_ASSIGN_OR_RETURN(bool changed, + ForEachEdgeInPostOrder([&](Cluster* from, Cluster* to) { + return ContractEdgeIfPreferred(from, to); + })); - // Check that RunEdgeContractionLoopInPostOrderOnce is idempotent. Once the - // linear time post-order scheme has been battle tested we can move this to - // happen only in debug builds. - TF_ASSIGN_OR_RETURN(changed, RunEdgeContractionLoopInPostOrderOnce()); + // Iterating over the whole graph once in post-order is sufficient to produce + // a maximal clustering: + // + // A. We visit a cluster only after maximally clustering all its children. + // B. By the time we're done with `node` (in `ForEachEdgeInPostOrder`) all of + // its children that could have been absorbed into `node` have been + // absorbed. + // C.
We have an invariant that making a cluster larger does not make edges + // leaving it more contractable. That is, if we have + // digraph { X->Y; Y->Z; } then collapsing X->Y does not make it possible + // to contract Y->Z if Y->Z was not contractible originally. + TF_ASSIGN_OR_RETURN(changed, + ForEachEdgeInPostOrder([&](Cluster* from, Cluster* to) { + return TryToContractEdge(from, to); + })); + + // Check that the conclusion made above (that iterating over the graph once in + // post order gives a maximal clustering) holds. Once the linear time + // post-order scheme has been battle tested we can move this to happen only in + // debug builds. + TF_ASSIGN_OR_RETURN(changed, + ForEachEdgeInPostOrder([&](Cluster* from, Cluster* to) { + return TryToContractEdge(from, to); + })); TF_RET_CHECK(!changed); return Status::OK(); } +std::atomic cluster_sequence_num; + +int64 GetNextClusterSequenceNumber() { return cluster_sequence_num++; } + Status MarkForCompilationPassImpl::CreateClusters() { TF_RET_CHECK(initialized_ && edges_contracted_ && !clusters_created_); clusters_created_ = true; - static std::atomic cluster_sequence_num; - // Names for each cluster. std::unordered_map cluster_names; @@ -745,7 +756,7 @@ Status MarkForCompilationPassImpl::CreateClusters() { string& name = cluster_names[cluster->cycles_graph_node_id()]; if (name.empty()) { - name = absl::StrCat("cluster_", cluster_sequence_num++); + name = absl::StrCat("cluster_", GetNextClusterSequenceNumber()); } n->AddAttr(kXlaClusterAttr, name); @@ -1065,8 +1076,7 @@ bool MarkForCompilationPassImpl::CompilationDisallowedByXlaCompileAttr( bool MarkForCompilationPassImpl::LogNotContractableAndReturnFalse( Cluster* from, Cluster* to, absl::string_view reason) { - VLOG(3) << "Could not contract " << from->DebugString(*graph_) << " -> " - << to->DebugString(*graph_) << " because " << reason << "."; + VLOG(3) << EdgeContractionFailureMsg(from, to, reason); return false; } @@ -1075,8 +1085,14 @@ StatusOr MarkForCompilationPassImpl::TryToContractEdge(Cluster* from, DCHECK(from->deadness_predicate().has_value() == to->deadness_predicate().has_value()); if (from->deadness_predicate() != to->deadness_predicate()) { - return LogNotContractableAndReturnFalse( - from, to, "the two nodes have mismatching deadness"); + VLOG(3) << EdgeContractionFailureMsg( + from, to, + absl::StrCat( + "the two nodes have mismatching deadness: ", + deadness_analysis_->DebugString(*from->deadness_predicate()), + " and ", + deadness_analysis_->DebugString(*to->deadness_predicate()))); + return false; } TF_ASSIGN_OR_RETURN(bool devices_compatible, @@ -1133,32 +1149,6 @@ StatusOr MarkForCompilationPassImpl::TryToContractEdge(Cluster* from, return MergeClusters(from, to); } -StatusOr MarkForCompilationPassImpl::TryToContractEdgesFrom( - Cluster* cluster_from) { - bool changed = false; - - // Make a copy of the set of successors because we may modify the graph in - // TryToContractEdge. - std::vector successors_copy = - cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id()); - - for (int to : successors_copy) { - iteration_count_++; - - Cluster* cluster_to = GetClusterForCyclesGraphNode(to); - if (!cluster_to) { - continue; - } - - TF_ASSIGN_OR_RETURN(bool contracted_edge, - TryToContractEdge(cluster_from, cluster_to)); - - changed |= contracted_edge; - } - - return changed; -} - Status MarkForCompilationPassImpl::Run() { // Make sure that kernels have been registered on the JIT device. 
XlaOpRegistry::RegisterCompilationKernels(); @@ -1485,7 +1475,8 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { op_filter.allow_control_trigger = true; op_filter.allow_eliding_assert_and_checknumerics_ops = true; op_filter.allow_ops_producing_or_consuming_variant = true; - op_filter.allow_slow_and_inaccurate_ops = true; + op_filter.allow_slow_ops = true; + op_filter.allow_inaccurate_ops = true; return RecursiveCompilabilityChecker{&op_filter, &jit_device_type} .IsCompilableCall(ndef, flr); @@ -1522,4 +1513,8 @@ Status MarkForCompilationPass::RunForTest( return MarkForCompilation(options, debug_options); } + +namespace testing { +void ResetClusterSequenceNumber() { cluster_sequence_num = 0; } +} // namespace testing } // namespace tensorflow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.h b/tensorflow/compiler/jit/mark_for_compilation_pass.h index 16b8427b60e..2eee144e645 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.h +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.h @@ -51,6 +51,13 @@ class MarkForCompilationPass : public GraphOptimizationPass { // function is compilable iff every operator in the function body is // compilable. bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef); + +namespace testing { +// DO NOT USE IN PRODUCTION. +// +// Resets some internal state to let us write reliable unit tests. +void ResetClusterSequenceNumber(); +} // namespace testing } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_H_ diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD index 64409d93347..3b7a74ec780 100644 --- a/tensorflow/compiler/jit/ops/BUILD +++ b/tensorflow/compiler/jit/ops/BUILD @@ -1,7 +1,6 @@ -licenses(["notice"]) # Apache 2.0 - package( default_visibility = ["//tensorflow/compiler/tf2xla:internal"], + licenses = ["notice"], # Apache 2.0 ) load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") diff --git a/tensorflow/compiler/jit/test_util.cc b/tensorflow/compiler/jit/test_util.cc index cada272090a..f50ecdf2287 100644 --- a/tensorflow/compiler/jit/test_util.cc +++ b/tensorflow/compiler/jit/test_util.cc @@ -49,7 +49,7 @@ Status ShapeAnnotationsMatch( missing.push_back(entry.first); } return errors::InvalidArgument("Missing shapes for nodes: ", - str_util::Join(missing, ",")); + absl::StrJoin(missing, ",")); } return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index 19e3793f29b..fbfda449ebd 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -60,7 +60,8 @@ Status XlaCpuDeviceFactory::CreateDevices( registration.cluster_control_trigger = true; registration.elide_assert_and_checknumerics = true; registration.cluster_variant_ops = true; - registration.cluster_slow_and_inaccurate_ops = true; + registration.cluster_slow_ops = true; + registration.cluster_inaccurate_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_CPU, registration); static XlaDeviceOpRegistrations* registrations = diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h index 3b9c4160b95..ff5d5d38e8c 100644 --- a/tensorflow/compiler/jit/xla_device_context.h +++ b/tensorflow/compiler/jit/xla_device_context.h @@ -71,7 +71,7 @@ class XlaDeviceContext : public DeviceContext { StatusCallback done) const override; xla::LocalClient* client() const { return client_; } - se::Stream* stream() const 
{ return stream_.get(); } + se::Stream* stream() const override { return stream_.get(); } se::Stream* host_to_device_stream() const { return host_to_device_stream_.get(); } diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index 293ea3997cc..68c8b64cc82 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/core/kernels/host_constant_op.h" #include "tensorflow/core/kernels/identity_n_op.h" #include "tensorflow/core/kernels/identity_op.h" +#include "tensorflow/core/kernels/logging_ops.h" #include "tensorflow/core/kernels/no_op.h" #include "tensorflow/core/kernels/queue_op.h" #include "tensorflow/core/kernels/resource_variable_ops.h" @@ -81,6 +82,11 @@ class XlaAssignVariableOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("_XlaRun").Device(DEVICE), KERNEL); #define REGISTER_XLA_DEVICE_KERNELS(DEVICE, TYPES) \ + REGISTER_KERNEL_BUILDER(Name("Assert") \ + .Device(DEVICE) \ + .HostMemory("condition") \ + .HostMemory("data"), \ + AssertOp); \ REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE), SendOp); \ REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE), RecvOp); \ REGISTER_KERNEL_BUILDER( \ diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 913612f9a6c..02eed3ee16f 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -95,7 +95,8 @@ Status XlaGpuDeviceFactory::CreateDevices( registration.cluster_control_trigger = true; registration.elide_assert_and_checknumerics = true; registration.cluster_variant_ops = true; - registration.cluster_slow_and_inaccurate_ops = true; + registration.cluster_slow_ops = true; + registration.cluster_inaccurate_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_GPU, registration); static XlaDeviceOpRegistrations* registrations = diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc index 4252e2e24ac..f720183e196 100644 --- a/tensorflow/compiler/jit/xla_interpreter_device.cc +++ b/tensorflow/compiler/jit/xla_interpreter_device.cc @@ -63,7 +63,8 @@ Status XlaInterpreterDeviceFactory::CreateDevices( registration.cluster_control_trigger = true; registration.elide_assert_and_checknumerics = true; registration.cluster_variant_ops = true; - registration.cluster_slow_and_inaccurate_ops = true; + registration.cluster_slow_ops = true; + registration.cluster_inaccurate_ops = true; XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER, registration); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 3bb698b33d6..d66c80fea90 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -167,32 +167,6 @@ Status SnapshotResourceVariables(OpKernelContext* ctx, return Status::OK(); } -XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped) - : se::DeviceMemoryAllocator(platform), wrapped_(wrapped) {} - -XlaAllocator::~XlaAllocator() {} - -xla::StatusOr XlaAllocator::Allocate( - int device_ordinal, uint64 size, bool retry_on_failure) { - AllocationAttributes attrs; - attrs.no_retry_on_failure = !retry_on_failure; - void* data = nullptr; - if (size != 0) { - data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs); - if (data == nullptr) { - return errors::ResourceExhausted( - "Out of memory 
while trying to allocate ", size, " bytes."); - } - } - return se::OwningDeviceMemory(se::DeviceMemoryBase(data, size), - device_ordinal, this); -} - -Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { - wrapped_->DeallocateRaw(mem.opaque()); - return Status::OK(); -} - XlaComputationLaunchContext::XlaComputationLaunchContext( xla::LocalClient* client, se::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors, bool use_multiple_streams) diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 4cb020ffe34..429ff0a065c 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -32,7 +32,6 @@ limitations under the License. #include "tensorflow/stream_executor/device_memory_allocator.h" namespace tensorflow { -class XlaAllocator; // Struct that represents a possibly-absent Tensor. struct OptionalTensor { @@ -104,74 +103,6 @@ class VariableInfo { Status LockVariables(absl::Span variables) EXCLUSIVE_LOCK_FUNCTION(); -// Adapter class that wraps a Tensorflow allocator as an XLA allocator. -// Assumes that the Tensorflow allocator permits asynchronous deallocation: -// see comment on `AllowsAsynchronousDeallocation()`. -class XlaAllocator : public se::DeviceMemoryAllocator { - public: - XlaAllocator(const se::Platform* platform, Allocator* wrapped); - ~XlaAllocator() override; - xla::StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure) override; - Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override; - - // The Tensorflow BFC allocator used on GPU allows host-side deallocation - // before GPU execution takes place. Tensorflow uses the ordering of the main - // compute stream to enforce a happens-before relationship between a memory - // allocation and code that reuses the same memory. If Tensorflow adds - // support for multiple GPU streams or allocators with different ordering - // requirements, this code may need to change. - // (This attribute has no effect on CPU.) - bool AllowsAsynchronousDeallocation() const override { return true; } - - private: - Allocator* wrapped_; -}; - -// Adapter class that wraps per-device TF allocators as an XLA allocator. -// Assumes that the Tensorflow allocator permits asynchronous deallocation; -// see comment on `AllowsAsynchronousDeallocation()`. -class MultiDeviceAdapter : public se::DeviceMemoryAllocator { - public: - MultiDeviceAdapter( - const se::Platform* platform, - std::vector> tf_allocators) - : DeviceMemoryAllocator(platform), - tf_allocators_(std::move(tf_allocators)) { - for (const auto& tf_allocator : tf_allocators_) { - per_device_allocators_.emplace_back(platform, tf_allocator.get()); - } - } - - xla::StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure) override { - CHECK_LT(device_ordinal, per_device_allocators_.size()); - return per_device_allocators_[device_ordinal].Allocate(device_ordinal, size, - retry_on_failure); - } - - Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override { - CHECK_LT(device_ordinal, per_device_allocators_.size()); - return per_device_allocators_[device_ordinal].Deallocate(device_ordinal, - mem); - } - - // The Tensorflow BFC allocator used on GPU allows host-side deallocation - // before GPU execution takes place. Tensorflow uses the ordering of the main - // compute stream to enforce a happens-before relationship between a memory - // allocation and code that reuses the same memory. 
If Tensorflow adds - // support for multiple GPU streams or allocators with different ordering - // requirements, this code may need to change. - // (This attribute has no effect on CPU.) - bool AllowsAsynchronousDeallocation() const override { return true; } - - private: - std::vector per_device_allocators_; - // The wrapped TF allocators backing per_device_allocators_ (XlaAllocator does - // not take ownership of its underlying Allocator). - std::vector> tf_allocators_; -}; - // Helper class to perform the marshalling of TensorFlow inputs and outputs to // ShapedBuffers suitable for passing to an XLA computation. class XlaComputationLaunchContext { diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD index 238fd15166c..c2ba5cb3ecd 100644 --- a/tensorflow/compiler/plugin/BUILD +++ b/tensorflow/compiler/plugin/BUILD @@ -28,10 +28,9 @@ ** Please don't remove this file - it is supporting some 3rd party plugins ** """ -licenses(["notice"]) - package( default_visibility = ["//visibility:public"], + licenses = ["notice"], ) cc_library( diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index fbb60d17316..43dbab1e9a7 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -954,7 +954,7 @@ tf_xla_py_test( tf_xla_py_test( name = "ternary_ops_test", - size = "small", + size = "medium", srcs = ["ternary_ops_test.py"], deps = [ ":xla_test", diff --git a/tensorflow/compiler/tests/cond_test.py b/tensorflow/compiler/tests/cond_test.py index 5963020bbb7..a28c2c5ca88 100644 --- a/tensorflow/compiler/tests/cond_test.py +++ b/tensorflow/compiler/tests/cond_test.py @@ -19,11 +19,13 @@ from __future__ import division from __future__ import print_function from tensorflow.compiler.tests import xla_test +from tensorflow.python.client import session from tensorflow.python.compiler.xla import xla from tensorflow.python.eager import function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -46,8 +48,8 @@ class CondTest(xla_test.XLATestCase): def f(): ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=1) output = control_flow_ops.cond( - constant_op.constant( - True), lambda: ta.write(0, 5.), lambda: ta.write(0, 10.)) + constant_op.constant(True), + lambda: ta.write(0, 5.), lambda: ta.write(0, 10.)) return output.stack() @@ -56,6 +58,46 @@ class CondTest(xla_test.XLATestCase): xla_context.Exit() + def testCondAndTensorArrayInDefun_constFolding(self): + g = ops.Graph() + with session.Session(graph=g), g.as_default(), self.test_scope(): + xla_context = control_flow_ops.XLAControlFlowContext() + xla_context.Enter() + + @function.defun + def f(): + ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=1) + output = control_flow_ops.cond( + constant_op.constant(False), + lambda: ta.write(0, 5.), lambda: ta.write(0, 10.)) + + return output.stack() + + output_t = f() + self.assertAllEqual([10.], self.evaluate(output_t)) + + xla_context.Exit() + + def testCondAndTensorArray_xlaCompile(self): + self.skipTest("b/127846988") + # Fails with "Uninitialized arguments" in XlaIfOp::Compile + with self.session(), self.test_scope(): + xla_context = control_flow_ops.XLAControlFlowContext() + xla_context.Enter() + + def f(): + ta = 
tensor_array_ops.TensorArray(dtype=dtypes.float32, size=1) + output = control_flow_ops.cond( + constant_op.constant(True), + lambda: ta.write(0, 5.), lambda: ta.write(0, 10.)) + + return output.stack() + + output_t, = xla.compile(f) + self.assertAllEqual([5.], self.evaluate(output_t)) + + xla_context.Exit() + def testCondConstPropagation(self): with self.session() as sess, self.test_scope(): xla_context = control_flow_ops.XLAControlFlowContext() @@ -199,6 +241,28 @@ class CondTest(xla_test.XLATestCase): xla_context.Exit() + def testSwitchCaseAndTensorArray_xlaCompile(self): + self.skipTest("b/127846988") + with self.session(), self.test_scope(): + xla_context = control_flow_ops.XLAControlFlowContext() + xla_context.Enter() + + def f(): + ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=1) + output = control_flow_ops.switch_case( + constant_op.constant(1), { + 0: lambda: ta.write(0, 5.), + 1: lambda: ta.write(0, 10.), + 2: lambda: ta.write(0, 15.), + }) + + return output.stack() + + output_t, = xla.compile(f) + self.assertAllEqual([10.], self.evaluate(output_t)) + + xla_context.Exit() + def testSwitchCaseConstPropagation(self): self.skipTest("b/127846988") with self.session() as sess, self.test_scope(): diff --git a/tensorflow/compiler/tests/extract_image_patches_op_test.py b/tensorflow/compiler/tests/extract_image_patches_op_test.py index 9e9b1f367e2..d0686c4bcb8 100644 --- a/tensorflow/compiler/tests/extract_image_patches_op_test.py +++ b/tensorflow/compiler/tests/extract_image_patches_op_test.py @@ -130,5 +130,20 @@ class ExtractImagePatches(xla_test.XLATestCase): padding="VALID", patches=patches) + def testKsize2x2Stride1x1Rate1x1ValidDepth2(self): + """Test for 2x2 kernel with VALID padding.""" + # [1, 2, 2, 2] + image = [[[[1, 5], [2, 6]], [[3, 7], [4, 8]]]] + # [1, 1, 1, 8] + patches = [[[[1, 5, 2, 6, 3, 7, 4, 8]]]] + self._VerifyValues( + image, + ksizes=[2, 2], + strides=[1, 1], + rates=[1, 1], + padding="VALID", + patches=patches) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index 7e8edc5f0b1..200851ee500 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -72,21 +72,21 @@ class TernaryOpsTest(xla_test.XLATestCase): for dtype in self.numeric_types: self._testTernary( array_ops.where, - np.array(0, dtype=np.bool), + np.array(False), np.array(2, dtype=dtype), np.array(7, dtype=dtype), expected=np.array(7, dtype=dtype)) self._testTernary( array_ops.where, - np.array(1, dtype=np.bool), + np.array(True), np.array([1, 2, 3, 4], dtype=dtype), np.array([5, 6, 7, 8], dtype=dtype), expected=np.array([1, 2, 3, 4], dtype=dtype)) self._testTernary( array_ops.where, - np.array(0, dtype=np.bool), + np.array(False), np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype), np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype)) @@ -105,6 +105,74 @@ class TernaryOpsTest(xla_test.XLATestCase): np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), expected=np.array([[7, 8], [3, 4], [11, 12]], dtype=dtype)) + def testSelectV2(self): + for dtype in self.numeric_types: + self._testTernary( + array_ops.where_v2, + np.array(False), + np.array(2, dtype=dtype), + np.array(7, dtype=dtype), + expected=np.array(7, dtype=dtype)) + + self._testTernary( + array_ops.where_v2, + np.array(True), + np.array([1, 2, 3, 4], dtype=dtype), + np.array([5, 6, 7, 8], dtype=dtype), + 
expected=np.array([1, 2, 3, 4], dtype=dtype)) + + self._testTernary( + array_ops.where_v2, + np.array(False), + np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype), + np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), + expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype)) + + self._testTernary( + array_ops.where_v2, + np.array([0, 1, 1, 0], dtype=np.bool), + np.array([1, 2, 3, 4], dtype=dtype), + np.array([5, 6, 7, 8], dtype=dtype), + expected=np.array([5, 2, 3, 8], dtype=dtype)) + + # Broadcast the condition + self._testTernary( + array_ops.where_v2, + np.array([0, 1], dtype=np.bool), + np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype), + np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), + expected=np.array([[7, 2], [9, 4], [11, 6]], dtype=dtype)) + + # Broadcast the then branch to the else + self._testTernary( + array_ops.where_v2, + np.array([[0, 1], [1, 0], [1, 1]], dtype=np.bool), + np.array([[1, 2]], dtype=dtype), + np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), + expected=np.array([[7, 2], [1, 10], [1, 2]], dtype=dtype)) + + # Broadcast the else branch to the then + self._testTernary( + array_ops.where_v2, + np.array([[1, 0], [0, 1], [0, 0]], dtype=np.bool), + np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), + np.array([[1, 2]], dtype=dtype), + expected=np.array([[7, 2], [1, 10], [1, 2]], dtype=dtype)) + + # Broadcast the then/else branches to the condition + self._testTernary( + array_ops.where_v2, + np.array([[1, 0], [0, 1], [1, 1]], dtype=np.bool), + np.array(7, dtype=dtype), + np.array(8, dtype=dtype), + expected=np.array([[7, 8], [8, 7], [7, 7]], dtype=dtype)) + self._testTernary( + array_ops.where_v2, + np.array([[1, 0], [0, 1], [0, 0]], dtype=np.bool), + np.array(7, dtype=dtype), + np.array([8, 9], dtype=dtype), + expected=np.array([[7, 9], [8, 7], [8, 9]], dtype=dtype)) + def testSlice(self): for dtype in self.numeric_types: self._testTernary( diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 723eba7eb96..34d4ee79542 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -104,6 +104,24 @@ struct EdgePtrCompare { } }; +// TODO(laigd): instead of deciding the device here, the converter should accept +// a device name as one of the conversion parameter so users can control on +// which device they want to run the conversion. +std::pair GetFirstValidDeviceId() { + for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) { + TfGpuId tf_gpu_id(tf_gpu_id_value); + PlatformGpuId platform_gpu_id; + Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id); + if (s.ok()) { + VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device " + << platform_gpu_id.value(); + return std::make_pair(tf_gpu_id, platform_gpu_id); + } + } + LOG(ERROR) << "Could not find any TF GPUs"; + return std::make_pair(TfGpuId(-1), PlatformGpuId(-1)); +} + // Function to get subsegment information structure. Status GetEngineInfo(const Graph* g, const grappler::GraphProperties& graph_properties, @@ -128,27 +146,43 @@ Status GetEngineInfo(const Graph* g, if (segment_nodes.count(node) == 0) continue; auto node_device = node->requested_device(); if (!node_device.empty()) { - // If device is CPU, treat as if no device was assigned. Don't add CPU to - // segment_device because that would cause a segfault in - // GetDeviceAndAllocator. 
This is because GetDeviceAndAllocator assumes - // any already set device is a GPU. + // If device is set, it means device placement may have been done before, + // so we need to assign a device for the TRTEngineOp to maintain the + // invariance. + // If the device is CPU in this case, it tries to find the first available + // GPU and use it as the device. DeviceNameUtils::ParsedName parsed_name; - DeviceNameUtils::ParseFullName(node_device, &parsed_name); - if (parsed_name.type == "CPU") { - VLOG(1) << "Node " << node->name() << " was assigned to the CPU. " - << "Attempting to place on GPU."; + const bool parse_succeeded = + DeviceNameUtils::ParseFullName(node_device, &parsed_name); + if (!parse_succeeded || (parse_succeeded && parsed_name.type == "CPU")) { + string msg; + if (!parse_succeeded) { + msg = StrCat("Failed to parse assigned device of node ", node->name(), + ". "); + } else { + msg = StrCat("Node ", node->name(), " was assigned to the CPU. "); + } + VLOG(1) << msg << "Attempting to place on GPU."; + TfGpuId tf_gpu_id; + PlatformGpuId platform_gpu_id; + std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId(); + if (tf_gpu_id.value() >= 0) { + parsed_name.type = "GPU"; + parsed_name.id = tf_gpu_id.value(); + segment_devices.insert(DeviceNameUtils::FullName( + parsed_name.job, parsed_name.replica, parsed_name.task, + parsed_name.type, parsed_name.id)); + } } else { segment_devices.insert(node_device); } + } else if (node->has_assigned_device_name()) { + // It appears that nodes will not have assigned devices at this point in + // execution. + segment_devices.insert(node->assigned_device_name()); } else { - if (node->has_assigned_device_name()) { - // It appears that nodes will not have assigned devices at this point in - // execution. - segment_devices.insert(node->assigned_device_name()); - } else { - VLOG(2) << "Node " << node->name() - << " neither have requested device nor assigned device"; - } + VLOG(2) << "Node " << node->name() + << " neither have requested device nor assigned device"; } subgraph_nodes.push_back(node); @@ -251,13 +285,11 @@ Status GetEngineInfo(const Graph* g, info->engine_name = StrCat(scope_name, info->engine_name); VLOG(1) << "Converted TensorRT candidate segment '" << info->engine_name << "' to a GraphDef"; - // TODO(sami): This should not happen once segmenter is updated. if (segment_devices.size() == 1) { info->device = *segment_devices.begin(); } else if (segment_devices.size() > 1) { - LOG(WARNING) << "Detected multiple(" << segment_devices.size() - << ") devices for the segment. Picking first one to continue " - << "but this shouldn't have happened"; + LOG(WARNING) << "Detected multiple (" << segment_devices.size() + << ") devices for the segment. Picking first one to continue."; info->device = *segment_devices.begin(); } else { VLOG(1) << "No device is assigned to the segment. 
" @@ -543,10 +575,10 @@ Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, std::map io_nodes; int num_inputs = 0; for (auto n : sgraph.op_nodes()) { - if (str_util::StartsWith(n->name(), kInputPHName)) { + if (absl::StartsWith(n->name(), kInputPHName)) { num_inputs++; io_nodes.insert({n->name(), n}); - } else if (str_util::StartsWith(n->name(), kOutputPHName)) { + } else if (absl::StartsWith(n->name(), kOutputPHName)) { io_nodes.insert({n->name(), n}); } } @@ -640,24 +672,17 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, if (params.cluster == nullptr || params.cluster->GetDeviceSet() == nullptr || engine.device.empty()) { // If device is not set, use the first found GPU device for the conversion. - for (int tf_gpu_id_value = 0; tf_gpu_id_value < 100; ++tf_gpu_id_value) { - TfGpuId tf_gpu_id(tf_gpu_id_value); - PlatformGpuId platform_gpu_id; - Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id); - if (s.ok()) { - VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device " - << platform_gpu_id.value(); - cuda_device_id = platform_gpu_id.value(); - GPUOptions gpu_options; - // If the TF to Cuda gpu id mapping exist, the device and corresponding - // allocator must have been initialized already, so the - // GetGPUAllocator() call won't create a new allocator. - dev_allocator = GPUProcessState::singleton()->GetGPUAllocator( - gpu_options, tf_gpu_id, 1); - break; - } - LOG(ERROR) << "TF GPU with id " << tf_gpu_id_value << " does not exist " - << s; + TfGpuId tf_gpu_id; + PlatformGpuId platform_gpu_id; + std::tie(tf_gpu_id, platform_gpu_id) = GetFirstValidDeviceId(); + cuda_device_id = platform_gpu_id.value(); + if (cuda_device_id >= 0) { + GPUOptions gpu_options; + // If the TF to Cuda gpu id mapping exist, the device and corresponding + // allocator must have been initialized already, so the + // GetGPUAllocator() call won't create a new allocator. 
+ dev_allocator = GPUProcessState::singleton()->GetGPUAllocator( + gpu_options, tf_gpu_id, 1); } return std::make_pair(cuda_device_id, dev_allocator); } @@ -750,8 +775,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { EngineInfo curr_engine; curr_engine.engine_name = StrCat("TRTEngineOp_", t); Status status = - GetEngineInfo(&graph, *params.graph_properties, curr_segment.first, - node_map, reverse_topo_order, &curr_engine); + GetEngineInfo(&graph, *params.graph_properties, curr_segment, node_map, + reverse_topo_order, &curr_engine); if (!status.ok()) { LOG(WARNING) << "Failed to get engine info for segment " << t << ": " << status; @@ -776,7 +801,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); total_engine_bytes_size += engine_bytes_size.back(); - total_num_nodes_in_segments += curr_segment.first.size(); + total_num_nodes_in_segments += curr_segment.size(); engine_segments.push_back(std::move(curr_engine)); converted_segments.push_back(std::move(curr_segment)); @@ -806,7 +831,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { engine.max_workspace_size_bytes = params.max_workspace_size_bytes * (engine_bytes_size.at(i) / total_engine_bytes_size + - converted_segments.at(i).first.size() / total_num_nodes_in_segments) / + converted_segments.at(i).size() / total_num_nodes_in_segments) / 2.0; VLOG(1) << "Assigned " << engine.max_workspace_size_bytes << " bytes to " << engine.engine_name; @@ -828,9 +853,9 @@ Status ConvertAfterShapes(const ConversionParams& params) { CreateTRTNode(params, engine_segments, i, params.max_batch_size, &graph, alloc.get(), &engine_nodes); - string msg = StrCat("TensorRT node ", engine.engine_name, - " added for segment ", i, " consisting of ", - converted_segments.at(i).first.size(), " nodes"); + string msg = + StrCat("TensorRT node ", engine.engine_name, " added for segment ", i, + " consisting of ", converted_segments.at(i).size(), " nodes"); if (status.ok()) { LOG(INFO) << msg << " succeeded."; } else { @@ -839,7 +864,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { } if (VLOG_IS_ON(1)) { msg = "Segment consists of nodes: "; - for (const Node* node : converted_segments.at(i).first) { + for (const Node* node : converted_segments.at(i)) { StrAppend(&msg, node->name(), ", "); } VLOG(1) << msg; @@ -848,7 +873,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { // If status is ok, we successfully added the node to the graph and can // remove segment ops. Otherwise graph is not modified. 
if (status.ok()) { - for (const Node* node : converted_segments.at(i).first) { + for (const Node* node : converted_segments.at(i)) { graph.RemoveNode(const_cast(node)); } } diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index d8db0ffac7e..647c9b5068b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -239,7 +239,7 @@ class ConvertAfterShapesTest : public ::testing::Test { params.output_names = &output_names; params.max_workspace_size_bytes = 8 << 20; params.output_graph_def = output_graph_def; - params.minimum_segment_size = 2; + params.minimum_segment_size = 1; params.graph_properties = &graph_properties; params.use_calibration = false; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 0ac508822f1..a1ccb3b3e6e 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -385,11 +385,10 @@ string DebugString(const nvinfer1::ITensor& tensor) { ", dims=", DebugString(tensor.getDimensions()), ")"); } -Status Converter::GetTrtBroadcastShape( - const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r, - nvinfer1::Dims* operand_l_new_dims, - nvinfer1::Dims* operand_r_new_dims) const { - // *************************************************************************** +Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, + const TRT_TensorOrWeights& operand_r, + nvinfer1::Dims* operand_l_new_dims, + nvinfer1::Dims* operand_r_new_dims) { // TensorRT Elementwise op supports broadcast but requires both tensor to be // of Identical rank // @@ -473,14 +472,13 @@ nvinfer1::ITensor* Converter::CreateConstantLayer( nvinfer1::Weights trt_weights = weights.GetTrtWeights(); nvinfer1::IConstantLayer* layer = network()->addConstant(dims, trt_weights); if (!layer) return nullptr; - const nvinfer1::DataType trt_dtype = trt_weights.type; nvinfer1::ITensor* trt_tensor = layer->getOutput(0); #if !IS_TRT_VERSION_GE(5, 1, 3, 0) // TODO(laigd): there is a bug in TensorRT 5.0 library that, if we don't set // the data type below, it will always be kFLOAT regardless what the data type // of the weights is. Once NVIDIA fixes this bug, we should remove the data // type setting logic below and test should still pass. - trt_tensor->setType(trt_dtype); + trt_tensor->setType(trt_weights.type); #endif return trt_tensor; } @@ -1677,190 +1675,6 @@ Status UnaryCompute(const TRT_ShapedWeights& iweights, return Status::OK(); } -// If swapped_inputs is false, 'tensor' is the left operand and 'weights' is the -// right operand. If swapped_inputs is true, those two are swapped. -// -// TODO(jie): broadcast is needed yet not implemented. -// Only implemented channel wise for the time being. -Status BinaryTensorOpWeight(OpConverterParams* params, - nvinfer1::ITensor* tensor, - TRT_ShapedWeights weights, bool swapped_inputs) { - static const std::unordered_set supported_ops = {"Sub", "Add", "Mul", - "Div", "RealDiv"}; - const auto& node_def = params->node_def; - if (!supported_ops.count(node_def.op())) { - return errors::Unimplemented(node_def.op(), " is not supported, at ", - node_def.name()); - } - - // Check scale mode. 
- auto dims_w = weights.shape_; - const auto dims_t = tensor->getDimensions(); - - // TODO(jie): addScale checks for input tensor dimension - if (dims_t.nbDims != 3) { - return errors::InvalidArgument("addScale requires tensor with rank 3, at ", - node_def.name()); - } - - // Default to element-wise - auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - - // TODO(jie): maybe use a permutation instead to support more cases; - bool need_to_permute = false; - - if (weights.count() == 1) { - scale_mode = nvinfer1::ScaleMode::kUNIFORM; - } else { - VLOG(2) << "weights dims: " << DebugString(dims_w) - << "; tensor dims: " << DebugString(dims_t); - // Make sure no broadcasting on batch dimension. - if (dims_w.nbDims == dims_t.nbDims + 1) { - if (dims_w.d[0] == 1) { - for (int i = 1; i < dims_w.nbDims; i++) { - dims_w.d[i - 1] = dims_w.d[i]; - } - dims_w.nbDims--; - } else { - return errors::InvalidArgument("Binary op cannot operate on batch, at ", - node_def.name()); - } - } - - if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) { - scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - // Default is element-wise - for (int i = 1; i < dims_w.nbDims; i++) { - if (dims_w.d[i] != dims_t.d[i]) { - // If dimension does not match, switch back to per-channel - scale_mode = nvinfer1::ScaleMode::kCHANNEL; - break; - } - } - // If the mode is per-channel, since channel dimension is assumed to be - // the third to last dimension, we need to make sure all other dimensions - // have size 1. - if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) { - for (int i = 1; i < dims_w.nbDims; i++) { - if (dims_w.d[i] != 1) - return errors::InvalidArgument( - "Weight dims not compatible for channel-wise broadcast at ", - node_def.name()); - } - } - } else if (dims_w.nbDims == 1 && - dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) { - // Channel wise and broadcast required. We compare the last dimension of - // the tensor shape because of tensorflow default broadcasting rules. - need_to_permute = true; - scale_mode = nvinfer1::ScaleMode::kCHANNEL; - } else { - return errors::InvalidArgument("Weight dims not compatible at ", - node_def.name()); - } - } - // TODO(laigd): we should add validation_only support in TransposeTensor() and - // PrepareTensorForShape(). - if (params->validation_only) return Status::OK(); - - // Transpose last dimension. - std::vector permutation(dims_t.nbDims + 1); - if (need_to_permute) { - // We swap the last dimension into channel for trt, because of tensorflow - // default broadcasting rules. - for (int i = 0; i < static_cast(permutation.size()); i++) { - permutation[i] = i; - } - permutation[1] = dims_t.nbDims; - permutation[dims_t.nbDims] = 1; - TF_RETURN_IF_ERROR( - params->converter->TransposeTensor(tensor, permutation, &tensor)); - } - - // Prepare weights - TRT_ShapedWeights shift_weights(weights.TrtDType()); - TRT_ShapedWeights scale_weights(weights.TrtDType()); - TRT_ShapedWeights power_weights(weights.TrtDType()); - - if (node_def.op() == "Sub") { - if (swapped_inputs) { - shift_weights = weights; - nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary( - *tensor, nvinfer1::UnaryOperation::kNEG); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - // Since quantization ranges are symmetric, the same range as the input - // will work for the negation of the input. 
- params->converter->MarkQuantizationRangesAsInferrable( - tensor, layer->getOutput(0)); - tensor = layer->getOutput(0); - } else { - TRT_ShapedWeights neg_weights = - params->weight_store->GetTempWeights(weights); - LambdaFactory unary_op; - unary_op.op = LambdaFactory::OP_CATEGORY::NEG; - TF_RETURN_IF_ERROR(UnaryCompute(weights, &neg_weights, unary_op)); - shift_weights = neg_weights; - } - } else if (node_def.op() == "Div" || node_def.op() == "RealDiv") { - if (swapped_inputs) { - // We need to infer the quantization range for this intermediate tensor. - // - // x -> [Recip] -> 1/x -> [Scale] -> s/x - // ^ - // need range for this - // - // We have the quantization scales for x and s/x - can we divide the scale - // for s/x by s? Only if it is a scalar. - // - // Because of this issue, fall back to BinaryTensorOpTensor if we are - // doing INT8 with no calibration. There is most likely no performance - // penalty by falling back here. - if (params->converter->precision_mode() == TrtPrecisionMode::INT8 && - !params->converter->use_calibration()) { - return errors::Unimplemented( - "Intermediate quantization range cannot be determined without" - " calibration. Falling back to BinaryTensorOpTensor for ", - node_def.op(), ", at ", node_def.name()); - } - scale_weights = weights; - nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary( - *tensor, nvinfer1::UnaryOperation::kRECIP); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - tensor = layer->getOutput(0); - } else { - TRT_ShapedWeights recip_weights = - params->weight_store->GetTempWeights(weights); - LambdaFactory unary_op; - unary_op.op = LambdaFactory::OP_CATEGORY::RECIP; - TF_RETURN_IF_ERROR(UnaryCompute(weights, &recip_weights, unary_op)); - scale_weights = recip_weights; - } - } else if (node_def.op() == "Mul") { - scale_weights = weights; - } else if (node_def.op() == "Add") { - shift_weights = weights; - } else { - // This should not happen. - return errors::Unimplemented("Binary op not supported at ", node_def.op()); - } - - nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( - *tensor, scale_mode, shift_weights.GetTrtWeights(), - scale_weights.GetTrtWeights(), power_weights.GetTrtWeights()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - - nvinfer1::ITensor* output_tensor = layer->getOutput(0); - // Transpose back dimension - if (need_to_permute) { - TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, permutation, &output_tensor)); - } - - // Pass the output - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -} - Status ConvertConv2DHelper(OpConverterParams* params, int group, bool is_conv2d_backprop_input) { const auto& inputs = params->inputs; @@ -1951,7 +1765,8 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, kernel_size.h() = weights.shape_.d[2]; kernel_size.w() = weights.shape_.d[3]; - // Add padding. +// Before TRT 5.1.3, we have to calculate padding ourselves. +#if !IS_TRT_VERSION_GE(5, 1, 3, 0) std::vector> padding; if (attrs.get("padding") == "SAME") { nvinfer1::DimsHW effective_kernel_size = kernel_size; @@ -1978,12 +1793,12 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, padding = {{0, 0}, {0, 0}}; } -// TensorRT 5.1 added support for asymmetric padding. Due to a bug in 5.1.2, we -// can only use asymmetric padding in convolutions with 5.1.3+. -#if !IS_TRT_VERSION_GE(5, 1, 3, 0) + // Handle asymmetric padding. 
TensorRT 5.1 added support for asymmetric + // padding via setPrePadding and setPostPadding. Due to a bug in 5.1.2, we can + // only use asymmetric padding in convolutions with 5.1.3+. But in 5.1.3, we + // will always use setPaddingMode for simplicity. if (padding[0].first != padding[0].second || padding[1].first != padding[1].second) { - // Handle asymmetric padding. auto pad_layer = params->converter->network()->addPadding( *tensor, nvinfer1::DimsHW(padding[0].first, padding[1].first), nvinfer1::DimsHW(padding[0].second, padding[1].second)); @@ -2006,20 +1821,13 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, layer->setStride(stride); // TensorRT 5.1.3 added support for padding modes. #if IS_TRT_VERSION_GE(5, 1, 3, 0) + // VALID padding is the default TRT behavior. if (attrs.get("padding") == "SAME") { - VLOG(2) << "Using SAME padding"; // SAME_UPPER means that post padding is preferred. layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - // For VALID padding, we need to manually set the padding. - layer->setPrePadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); - layer->setPostPadding( - nvinfer1::DimsHW{padding[0].second, padding[1].second}); - VLOG(2) << "Set pre-padding to: " << DebugString(layer->getPrePadding()) - << " and post-padding to: " << DebugString(layer->getPostPadding()); #else layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); - VLOG(2) << "Set padding to: " << DebugString(layer->getPadding()); #endif layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); @@ -2033,17 +1841,10 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, layer->setStride(stride); #if IS_TRT_VERSION_GE(5, 1, 3, 0) if (attrs.get("padding") == "SAME") { - VLOG(2) << "Using SAME padding"; layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setPrePadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); - layer->setPostPadding( - nvinfer1::DimsHW{padding[0].second, padding[1].second}); - VLOG(2) << "Set pre-padding to: " << DebugString(layer->getPrePadding()) - << " and post-padding to: " << DebugString(layer->getPostPadding()); #else layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); - VLOG(2) << "Set padding to: " << DebugString(layer->getPadding()); #endif layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); @@ -2061,74 +1862,6 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, return Status::OK(); } -Status BinaryTensorOpTensor(OpConverterParams* params, - const TRT_TensorOrWeights& operand_l, - const TRT_TensorOrWeights& operand_r) { - const auto& node_def = params->node_def; - static const std::unordered_map ops{ - {"Add", nvinfer1::ElementWiseOperation::kSUM}, - {"Mul", nvinfer1::ElementWiseOperation::kPROD}, - {"Sub", nvinfer1::ElementWiseOperation::kSUB}, - {"Div", nvinfer1::ElementWiseOperation::kDIV}, - {"RealDiv", nvinfer1::ElementWiseOperation::kDIV}, - {"Minimum", nvinfer1::ElementWiseOperation::kMIN}, - {"Maximum", nvinfer1::ElementWiseOperation::kMAX}, - {"Pow", nvinfer1::ElementWiseOperation::kPOW}, - }; - auto op_pair = ops.find(node_def.op()); - if (op_pair == ops.end()) { - return errors::Unimplemented("Binary op ", node_def.op(), - " not supported at: ", node_def.name()); - } - - nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; - Status status = params->converter->GetTrtBroadcastShape( - operand_l, operand_r, &broadcasted_dims_l, &broadcasted_dims_r); - if (!status.ok()) { - return 
errors::InvalidArgument( - "Unsupported binary op broadcast scheme for op ", node_def.name(), ": ", - status.error_message()); - } - TFAttrs attrs(node_def); - nvinfer1::DataType dtype = attrs.get("T"); - if (dtype == nvinfer1::DataType::kINT32) { - return errors::Unimplemented("Binary op ", node_def.op(), - " does not support INT32, at ", - node_def.name()); - } - if (params->validation_only) return Status::OK(); - - nvinfer1::ITensor* tensor_l = nullptr; - nvinfer1::ITensor* tensor_r = nullptr; - status = params->converter->PrepareTensorForShape( - operand_l, broadcasted_dims_l, /*validation_only=*/false, &tensor_l); - if (status.ok()) { - status = params->converter->PrepareTensorForShape( - operand_r, broadcasted_dims_r, /*validation_only=*/false, &tensor_r); - } - if (!status.ok()) { - return errors::Internal("Failed to convert binary op ", node_def.name(), - ": ", status.error_message()); - } - - // Check type consistency. - TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype) - << DebugString(tensor_l->getType()) << " vs " << DebugString(dtype); - TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype) - << DebugString(tensor_r->getType()) << " vs " << DebugString(dtype); - - // Add ElementWise layer. - nvinfer1::IElementWiseLayer* layer = - params->converter->network()->addElementWise(*tensor_l, *tensor_r, - op_pair->second); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - nvinfer1::ITensor* output_tensor = layer->getOutput(0); - - // Pass the output - params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return Status::OK(); -} - Status ConvertPlugin(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -2777,6 +2510,8 @@ Status ConvertPool(OpConverterParams* params) { const auto tf_kernel = attrs.get>("ksize"); const nvinfer1::DimsHW ksize(tf_kernel[h_index], tf_kernel[w_index]); +// Before TRT 5.1.3, we have to calculate padding ourselves. +#if !IS_TRT_VERSION_GE(5, 1, 3, 0) auto tensor_dim = tensor->getDimensions(); std::vector> padding; if (padding_type == "SAME") { @@ -2789,13 +2524,13 @@ Status ConvertPool(OpConverterParams* params) { } else if (padding_type == "VALID") { padding = {{0, 0}, {0, 0}}; } - -// TensorRT 5.1 added support for asymmetric padding. +#endif +// TensorRT 5.1 added support for asymmetric padding. Before that, we need an +// extra padding layer. #if !IS_TRT_VERSION_GE(5, 1, 0, 0) + // Asymmetric padding case. if (padding[0].first != padding[0].second || padding[1].first != padding[1].second) { - VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second - << padding[1].first << padding[1].second; auto pad_layer = params->converter->network()->addPadding( *tensor, nvinfer1::DimsHW(padding[0].first, padding[1].first), nvinfer1::DimsHW(padding[0].second, padding[1].second)); @@ -2817,16 +2552,13 @@ Status ConvertPool(OpConverterParams* params) { layer->getOutput(0)); layer->setStride(stride); -// TensorRT 5.1.3 added support for padding modes. #if IS_TRT_VERSION_GE(5, 1, 3, 0) + // VALID padding is the default TRT behavior. if (attrs.get("padding") == "SAME") { // SAME_UPPER means that post padding is preferred. layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } -#endif -// TensorRT 5.1 has support for asymmetric padding. -#if IS_TRT_VERSION_GE(5, 1, 0, 0) - // If padding mode is not SAME, then these values will be used instead. 
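For the pre-5.1.3 branches above, the padding pairs follow TensorFlow's SAME rule: the total padding for a spatial dimension is max((ceil(input/stride) - 1)*stride + effective_kernel - input, 0), split with the smaller half in front, which is also why kSAME_UPPER (post padding preferred) is the matching TensorRT padding mode on 5.1.3+. A minimal sketch of that split (helper name is illustrative, not from the patch):

    #include <algorithm>
    #include <utility>

    // SAME-padding split for one spatial dimension: returns {pad_before,
    // pad_after}. The extra pixel, if the total is odd, goes to the end,
    // matching SAME_UPPER (post padding preferred).
    std::pair<int, int> SamePaddingForDim(int input_size, int stride,
                                          int effective_kernel_size) {
      const int output_size = (input_size + stride - 1) / stride;  // ceil
      const int total_pad = std::max(
          0, (output_size - 1) * stride + effective_kernel_size - input_size);
      const int pad_before = total_pad / 2;
      return {pad_before, total_pad - pad_before};
    }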
+#elif IS_TRT_VERSION_GE(5, 1, 0, 0) layer->setPrePadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); layer->setPostPadding(nvinfer1::DimsHW{padding[0].second, padding[1].second}); #else @@ -3350,9 +3082,6 @@ Status ConvertIdentity(OpConverterParams* params) { Status ConvertBinary(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - // TODO(tmorris): Enable once false is updated to mean either tensor or weight - // TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y", - // false}})); if (inputs.size() != 2) { return errors::InvalidArgument(node_def.op(), " got ", inputs.size(), " inputs but expected 2, at ", @@ -3368,33 +3097,45 @@ Status ConvertBinary(OpConverterParams* params) { "both input as constant at: ", node_def.name()); } + const TRT_TensorOrWeights& operand_l = inputs.at(0); + const TRT_TensorOrWeights& operand_r = inputs.at(1); - // TODO(tmorris): TRT plans to deprecate IScaleLayer and will replace it with - // IElementwiseLayer. At that point, we can remove BinaryTensorOpWeight. For - // now, the performance will be slightly better with IScaleLayer because it - // can be fused in more situations. However, most of the benefits of - // IScaleLayer are when the layer performs both a shift and a scale, which we - // don't do except for convolutions. - // - // Try to convert into Scale layer first (for better performance). - // Since scale layer supports restricted broadcast policy and op types, we - // allow failure and try to handle it through Elementwise op - // (BinaryTensorOpTensor). - Status status = Status::OK(); - if (inputs.at(0).is_tensor() && inputs.at(1).is_weights()) { - status = BinaryTensorOpWeight(params, inputs.at(0).tensor(), - inputs.at(1).weights(), false); - } else if (inputs.at(0).is_weights() && inputs.at(1).is_tensor()) { - status = BinaryTensorOpWeight(params, inputs.at(1).tensor(), - inputs.at(0).weights(), true); + static const std::unordered_map ops{ + {"Add", nvinfer1::ElementWiseOperation::kSUM}, + {"Mul", nvinfer1::ElementWiseOperation::kPROD}, + {"Sub", nvinfer1::ElementWiseOperation::kSUB}, + {"Div", nvinfer1::ElementWiseOperation::kDIV}, + {"RealDiv", nvinfer1::ElementWiseOperation::kDIV}, + {"Minimum", nvinfer1::ElementWiseOperation::kMIN}, + {"Maximum", nvinfer1::ElementWiseOperation::kMAX}, + {"Pow", nvinfer1::ElementWiseOperation::kPOW}, + }; + auto op_pair = ops.find(node_def.op()); + if (op_pair == ops.end()) { + return errors::Unimplemented("Binary op ", node_def.op(), + " not supported at: ", node_def.name()); } - // If both input are tensors, or one of them is weights but the conversion - // above failed, try the conversion using BinaryTensorOpTensor. - if ((inputs.at(0).is_tensor() && inputs.at(1).is_tensor()) || !status.ok()) { - if (!status.ok()) VLOG(2) << status; - status = BinaryTensorOpTensor(params, inputs.at(0), inputs.at(1)); - } - return status; + + nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; + TF_RETURN_IF_ERROR(GetTrtBroadcastShape( + operand_l, operand_r, &broadcasted_dims_l, &broadcasted_dims_r)); + + nvinfer1::ITensor* tensor_l = nullptr; + nvinfer1::ITensor* tensor_r = nullptr; + // This will also convert constants to tensors, and set quantization ranges. 
+ TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( + operand_l, broadcasted_dims_l, params->validation_only, &tensor_l)); + TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( + operand_r, broadcasted_dims_r, params->validation_only, &tensor_r)); + if (params->validation_only) return Status::OK(); + + // Add ElementWise layer. + nvinfer1::IElementWiseLayer* layer = + params->converter->network()->addElementWise(*tensor_l, *tensor_r, + op_pair->second); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); } Status ConvertRsqrt(OpConverterParams* params) { @@ -4547,7 +4288,7 @@ Status ConvertSquaredDifference(OpConverterParams* params) { const auto& node_def = params->node_def; // Broadcast inputs. nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; - TF_RETURN_IF_ERROR(params->converter->GetTrtBroadcastShape( + TF_RETURN_IF_ERROR(GetTrtBroadcastShape( inputs.at(0), inputs.at(1), &broadcasted_dims_l, &broadcasted_dims_r)); nvinfer1::ITensor* tensor_l = nullptr; nvinfer1::ITensor* tensor_r = nullptr; @@ -4692,8 +4433,8 @@ Status ConvertCombinedNMS(OpConverterParams* params) { TFTRT_RETURN_ERROR_IF_NULLPTR(creator, node_def.name()); // Create plugin - nvinfer1::IPluginV2* plugin = - creator->createPlugin(node_def.name().c_str(), &fc); + TrtUniquePtrType plugin( + creator->createPlugin(node_def.name().c_str(), &fc)); TFTRT_RETURN_ERROR_IF_NULLPTR(plugin, node_def.name()); // Set plugin inputs @@ -4875,7 +4616,8 @@ static void RegisterValidatableOpConverters( for (auto pool_op_type : {"AvgPool", "MaxPool"}) { (*registration)[pool_op_type] = ConvertPool; } - for (auto normalization_op_type : {"FusedBatchNorm", "FusedBatchNormV2"}) { + for (auto normalization_op_type : + {"FusedBatchNorm", "FusedBatchNormV2", "FusedBatchNormV3"}) { (*registration)[normalization_op_type] = ConvertFusedBatchNorm; } for (auto unary_op_pair : *UnaryOperationMap()) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 763b28b7402..d0f6d5ef1d1 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -512,13 +512,6 @@ class Converter { const bool validation_only, nvinfer1::ITensor** tensor); - // Return OK if the broadcast scheme is supported and compute the shapes after - // broadcasting. - Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, - const TRT_TensorOrWeights& operand_r, - nvinfer1::Dims* operand_l_new_dims, - nvinfer1::Dims* operand_r_new_dims) const; - // Creates an IConstantLayer using 'weights' whose dimensions are specified by // 'dims', and returns the output ITensor. nvinfer1::ITensor* CreateConstantLayer(const TRT_ShapedWeights& weights, @@ -592,6 +585,13 @@ class Converter { friend class OpConverterTest; }; +// Return OK if the broadcast scheme is supported and compute the shapes after +// broadcasting. 
+Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, + const TRT_TensorOrWeights& operand_r, + nvinfer1::Dims* operand_l_new_dims, + nvinfer1::Dims* operand_r_new_dims); + // Map of all supported UnaryOperations const std::unordered_map* UnaryOperationMap(); // Map of all supported ActivationTypes diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index c4ba69c1393..09b7a60c083 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -988,19 +988,17 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) { operand_2_shape, operand_2_is_tensor, operand_2_batch_size); // operand_1 broadcast operand_2 - ExpectStatus( - this->converter_->GetTrtBroadcastShape( - operand_1, operand_2, &operand_1_new_dims, &operand_2_new_dims), - expected_code, expected_error_msg_substr); + ExpectStatus(GetTrtBroadcastShape(operand_1, operand_2, &operand_1_new_dims, + &operand_2_new_dims), + expected_code, expected_error_msg_substr); if (expected_code == error::OK) { ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); } // operand_2 broadcast operand_1 - ExpectStatus( - this->converter_->GetTrtBroadcastShape( - operand_2, operand_1, &operand_2_new_dims, &operand_1_new_dims), - expected_code, expected_error_msg_substr); + ExpectStatus(GetTrtBroadcastShape(operand_2, operand_1, &operand_2_new_dims, + &operand_1_new_dims), + expected_code, expected_error_msg_substr); if (expected_code == error::OK) { ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); @@ -1033,18 +1031,29 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) { error::INVALID_ARGUMENT, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 4 vs broadcast #dims 5)"); + symmetric_test({3}, {1, 1, 3}, kIsTensor, kIsNotTensor, {}, {}, + error::INVALID_ARGUMENT, + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims 2 vs broadcast #dims 3)", + /*operand_1_batch_size=*/2); // Both inputs are tensors. 
symmetric_test({1, 1, 1}, {1, 1}, kIsTensor, kIsTensor, {}, {}, error::INVALID_ARGUMENT, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 3 vs broadcast #dims 4)"); + symmetric_test({1, 3}, {3}, kIsTensor, kIsTensor, {}, {}, + error::INVALID_ARGUMENT, + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims 2 vs broadcast #dims 3)"); symmetric_test({1, 3, 4}, {2, 1, 4}, kIsTensor, kIsTensor, {1, 3, 4}, {2, 1, 4}); symmetric_test({1, 1, 1}, {1, 1, 1, 1}, kIsTensor, kIsTensor, {}, {}, error::INVALID_ARGUMENT, "Broadcasting beyond batch dimension is not supported " "(tensor #dims 4 vs broadcast #dims 5)"); + symmetric_test({2, 3}, {7, 5}, kIsTensor, kIsTensor, {}, {}, + error::INVALID_ARGUMENT, "Infeasible broadcast scheme"); } TEST_F(ConverterTest, CreateConstantLayer) { @@ -1070,7 +1079,7 @@ class ConvertGraphDefToEngineTest : public ::testing::Test { int batch_size = -1; for (const NodeDef& node : gdef.node()) { absl::string_view node_name(node.name()); - if (str_util::ConsumePrefix(&node_name, kInputPHName)) { + if (absl::ConsumePrefix(&node_name, kInputPHName)) { int port = -1; EXPECT_TRUE(absl::SimpleAtoi(node_name, &port)) << node.name(); if (input_shapes.size() < port + 1) input_shapes.resize(port + 1); @@ -1351,10 +1360,6 @@ class OpConverterTest : public ::testing::Test { } } - void TestMatMulHelper( - const std::function& get_matmul, - const std::string& op_name); - // Expose quantization_ranges_ for tests std::unordered_map& quantization_ranges() { return converter_->quantization_ranges_; @@ -1682,59 +1687,60 @@ TEST_F(OpConverterTest, ConvertReshape) { // Helper function for testing MatMul and BatchMatMul // get_matmul corresponds to the function used to generate the node. It should // accept (DataType, transpose_a, transpose_b) as parameters. -void OpConverterTest::TestMatMulHelper( +void TestMatMulHelper( + OpConverterTest* test, const std::function& get_matmul, const std::string& op_name) { // HACK: This needs to be done in a better way. const bool is_batch_matmul = op_name == "BatchMatMul"; { // Unsupported data type. - Reset(); + test->Reset(); NodeDef node_def = get_matmul(DT_INT32, false, false); - AddTestTensor("input", {2}, /*batch_size=*/1, nvinfer1::DataType::kINT32); - AddTestWeights("weights", {2, 1}, {3, 5}); - RunValidationAndConversion( + test->AddTestTensor("input", {2}, /*batch_size=*/1, + nvinfer1::DataType::kINT32); + test->AddTestWeights("weights", {2, 1}, {3, 5}); + test->RunValidationAndConversion( node_def, error::UNIMPLEMENTED, - ("Data type int32 is not supported for " + op_name + - ", " - "must be one of [float, half], at my_matmul") + StrCat("Data type int32 is not supported for ", op_name, + ", must be one of [float, half], at my_matmul") .c_str()); } // OK. 
for (bool transpose_a : {false, true}) { for (bool transpose_b : {false, true}) { - Reset(); + test->Reset(); NodeDef node_def = get_matmul(DT_FLOAT, transpose_a, transpose_b); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); + test->AddTestTensor("input", {2}, /*batch_size=*/1); + test->AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); if (is_batch_matmul) { if (transpose_a || transpose_b) { - RunValidationAndConversion( + test->RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, "Input weight attempts to broadcast across batch dimension for " "BatchMatMul, at my_matmul"); } else { - RunValidationAndConversion( + test->RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, "Input weight attempts to broadcast across batch dimension"); } continue; } else if (transpose_a) { - RunValidationAndConversion( + test->RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, "Cannot transpose first input if it is a tensor with fewer than 2 " "non-batch dimensions"); continue; } - RunValidationAndConversion(node_def); + test->RunValidationAndConversion(node_def); TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output)); + TF_EXPECT_OK(test->GetTensorOrWeights("my_matmul", &output)); ASSERT_TRUE(output.is_tensor()); ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); const DataVec input_data{{"input", test::AsTensor({0, 1})}}; DataVec output_data{{"my_matmul", ConstructTensor(2)}}; - BuildAndRun(input_data, &output_data); + test->BuildAndRun(input_data, &output_data); if (transpose_b) { EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); } else { @@ -1744,31 +1750,31 @@ void OpConverterTest::TestMatMulHelper( } // OK, 3D inputs for (bool transpose_b : {false, true}) { - Reset(); + test->Reset(); NodeDef node_def = get_matmul(DT_FLOAT, /*transpose_a=*/false, transpose_b); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); + test->AddTestTensor("input", {2}, /*batch_size=*/1); + test->AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); if (is_batch_matmul) { if (transpose_b) { - RunValidationAndConversion( + test->RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, "Input weight attempts to broadcast across batch dimension for " "BatchMatMul, at my_matmul"); } else { - RunValidationAndConversion( + test->RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, "Input weight attempts to broadcast across batch dimension"); } continue; } - RunValidationAndConversion(node_def); + test->RunValidationAndConversion(node_def); TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output)); + TF_EXPECT_OK(test->GetTensorOrWeights("my_matmul", &output)); ASSERT_TRUE(output.is_tensor()); ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); const DataVec input_data{{"input", test::AsTensor({0, 1})}}; DataVec output_data{{"my_matmul", ConstructTensor(2)}}; - BuildAndRun(input_data, &output_data); + test->BuildAndRun(input_data, &output_data); if (transpose_b) { EXPECT_THAT(GetSpanForData(output_data[0]), ElementsAre(1, 3)); } else { @@ -1832,7 +1838,7 @@ TEST_F(OpConverterTest, ConvertMatMul) { node_def, error::INVALID_ARGUMENT, "Cannot currently transpose constant input if it is not 2 dimensional"); } - TestMatMulHelper(get_matmul_nodedef, "MatMul"); + TestMatMulHelper(this, get_matmul_nodedef, "MatMul"); } TEST_F(OpConverterTest, ConvertBatchMatMul) { @@ -1889,7 +1895,7 @@ 
TEST_F(OpConverterTest, ConvertBatchMatMul) { } } - TestMatMulHelper(get_batch_matmul_nodedef, "BatchMatMul"); + TestMatMulHelper(this, get_batch_matmul_nodedef, "BatchMatMul"); } template @@ -2010,250 +2016,82 @@ void CheckAddedLayers(OpConverterTest* test, bool expect_scale_layer) { } template -void TestBinaryTensorOpWeightNoBroadcast(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - for (auto swap_inputs : {false, true}) { - test->Reset(); - NodeDef node_def; - if (swap_inputs) { - node_def = GetBinaryOpNodeDef("weights", "input", dtype); - } else { - node_def = GetBinaryOpNodeDef("input", "weights", dtype); - } - - const std::vector operand1{CType(3), CType(7.5)}; - const std::vector operand2{CType(2), CType(3)}; - - // It requires the dims to be at least of rank 3 to apply an IScaleLayer. - test->AddTestTensor("input", /*dims=*/{1, 1, 2}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - test->AddTestWeights("weights", /*dims=*/{1, 1, 2}, - /*values=*/swap_inputs ? operand1 : operand2); - test->RunValidationAndConversion(node_def); - - // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor. - CheckAddedLayers(test, /*expect_scale_layer=*/true); - - // Check the dims of the output ITensor. - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 1, 2}, output.tensor()->getDimensions()); - - const DataVec input_data{ - {"input", test::AsTensor(swap_inputs ? operand2 : operand1)}}; - DataVec output_data{{"my_binary", ConstructTensor(2)}}; - test->BuildAndRun( - input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); - if (node_def.op() == "Add") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(5), CType(10.5))); - } else if (node_def.op() == "Sub") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(4.5))); - } else if (node_def.op() == "Mul") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(6), CType(22.5))); - } else if (node_def.op() == "Div") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1.5), CType(2.5))); - } else if (node_def.op() == "RealDiv") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1.5), CType(2.5))); - } else { - ASSERT_TRUE(false); - } - } -} - -template -void TestBinaryTensorOpWeightWithChannelWiseBroadcast(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - const NodeDef node_def = - GetBinaryOpNodeDef("input", "weights", dtype); - const std::vector input{CType(1), CType(2), CType(3), CType(4)}; - const std::vector weights{CType(10), CType(20)}; - // There are two types of valid dim pairs which requires channel-wise - // broadcasting: - // - input dims (X Y Z) vs weights dims (X 1 1) - // - input dims (X Y Z) vs weights dims (Z) - // Here X=Z=2 and Y=1. - for (auto weights_dims : std::vector>{{2, 1, 1}, {2}}) { - test->Reset(); - test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - test->AddTestWeights("weights", weights_dims, weights); - test->RunValidationAndConversion(node_def); - - // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor. - CheckAddedLayers(test, /*expect_scale_layer=*/true); - - // Check the dims of the output ITensor. 
- TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions()); - - const DataVec input_data{{"input", test::AsTensor(input)}}; - DataVec output_data{{"my_binary", ConstructTensor(4)}}; - test->BuildAndRun(input_data, &output_data); - if (weights_dims.size() == 1) { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(11), CType(22), CType(13), CType(24))); - } else { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(11), CType(12), CType(23), CType(24))); - } - } -} - -template -void TestBinaryTensorOpWeightWithUniformlyBroadcast(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - const NodeDef node_def = - GetBinaryOpNodeDef("input", "weights", dtype); - const std::vector input{CType(1), CType(2), CType(3), CType(4)}; - const std::vector weights{CType(10)}; - test->Reset(); - test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - test->AddTestWeights("weights", {1, 1, 1, 1}, weights); - test->RunValidationAndConversion(node_def); - - // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor. - CheckAddedLayers(test, /*expect_scale_layer=*/true); - - // Check the dims of the output ITensor. - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions()); - - const DataVec input_data{{"input", test::AsTensor(input)}}; - DataVec output_data{{"my_binary", ConstructTensor(4)}}; - test->BuildAndRun(input_data, &output_data); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(11), CType(12), CType(13), CType(14))); -} - -template -void TestBinaryTensorOpWeightFallback(OpConverterTest* test, - const std::vector& input_dims, - const std::vector& weights_dims, - error::Code code = error::OK, - const char* error_msg_substr = nullptr, - const int input_batch_size = 1) { - const DataType dtype = DT_FLOAT; - typedef typename EnumToDataType::Type CType; - const size_t num_inputs = TrtTensorDimsNumElements(GetTestDims(input_dims)); - const size_t num_weights = - TrtWeightDimsNumElements(GetTestDims(weights_dims)); - - test->Reset(); - const NodeDef node_def = - GetBinaryOpNodeDef("input", "weights", dtype); - test->AddTestTensor("input", /*dims=*/input_dims, input_batch_size, - TfDataTypeToTrt(dtype)); - test->AddTestWeights( - "weights", /*dims=*/weights_dims, - /*values=*/std::vector(num_weights, CType(1))); - test->RunValidationAndConversion(node_def, code, error_msg_substr); - if (code != error::OK) return; - - // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight. - CheckAddedLayers(test, /*expect_scale_layer=*/false); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); - ASSERT_TRUE(output.is_tensor()); - - // Check the dims of the output ITensor. - std::vector expected_output_dims = input_dims; - for (int i = expected_output_dims.size() - 1, j = weights_dims.size() - 1; - i >= 0 && j >= 0; --i, --j) { - if (expected_output_dims[i] == 1) { - expected_output_dims[i] = weights_dims[j]; - } - } - ExpectTrtDimsEqualsArray(expected_output_dims, - output.tensor()->getDimensions()); - - // Check the result of running the engine. 
- const int expected_num_outputs = - TrtTensorDimsNumElements(GetTestDims(expected_output_dims)); - const DataVec input_data{ - {"input", ConstructTensor(num_inputs, CType(2))}}; - DataVec output_data{ - {"my_binary", ConstructTensor(expected_num_outputs)}}; - test->BuildAndRun(input_data, &output_data); - if (node_def.op() == "Add") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(std::vector(expected_num_outputs, CType(3)))); - } else if (node_def.op() == "Minimum") { - EXPECT_THAT( - GetSpanForData(output_data[0]), - ElementsAreArray(std::vector(expected_num_outputs, CType(1)))); - } else { - ASSERT_TRUE(false); - } -} - -template -void TestBinaryTensorOpTensor(OpConverterTest* test) { +void TestBinaryOp(OpConverterTest* test, bool operand_1_is_tensor, + bool operand_2_is_tensor) { typedef typename EnumToDataType::Type CType; test->Reset(); const NodeDef node_def = GetBinaryOpNodeDef("input1", "input2", dtype); - test->AddTestTensor("input1", /*dims=*/{1, 2}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - test->AddTestTensor("input2", /*dims=*/{2, 1}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); + if (operand_1_is_tensor) { + test->AddTestTensor("input1", /*dims=*/{1, 2}, /*batch_size=*/2, + TfDataTypeToTrt(dtype)); + } else { + test->AddTestWeights("input1", /*dims=*/{1, 2}, + /*values=*/std::vector{CType(3), CType(6)}); + } + if (operand_2_is_tensor) { + test->AddTestTensor("input2", /*dims=*/{2, 1}, /*batch_size=*/2, + TfDataTypeToTrt(dtype)); + } else { + test->AddTestWeights("input2", /*dims=*/{2, 1}, + /*values=*/std::vector{CType(2), CType(3)}); + } test->RunValidationAndConversion(node_def); - // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight. - CheckAddedLayers(test, /*expect_scale_layer=*/false); - + DataVec input_data; + if (operand_1_is_tensor) { + input_data.push_back( + {"input1", + test::AsTensor({CType(3), CType(6), CType(3), CType(6)})}); + } + if (operand_2_is_tensor) { + input_data.push_back( + {"input2", + test::AsTensor({CType(2), CType(3), CType(2), CType(3)})}); + } + DataVec output_data{{"my_binary", ConstructTensor(8)}}; // Check output dims. TRT_TensorOrWeights output; TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); ASSERT_TRUE(output.is_tensor()); ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); - - const DataVec input_data{ - {"input1", test::AsTensor({CType(3), CType(6)})}, - {"input2", test::AsTensor({CType(2), CType(3)})}}; - DataVec output_data{{"my_binary", ConstructTensor(4)}}; // After broadcasting first input becomes {3, 6, 3, 6} and second input // becomes {2, 3, 2, 3}. test->BuildAndRun( input_data, &output_data, - dtype == DT_HALF ? TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32); + dtype == DT_HALF ? 
TrtPrecisionMode::FP16 : TrtPrecisionMode::FP32, + /*batch_size=*/2); if (node_def.op() == "Add") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(5), CType(8), CType(6), CType(9))); + EXPECT_THAT( + GetSpanForData(output_data[0]), + ElementsAreArray(CastTestVector({5, 8, 6, 9, 5, 8, 6, 9}))); } else if (node_def.op() == "Sub") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(4), CType(0), CType(3))); + EXPECT_THAT( + GetSpanForData(output_data[0]), + ElementsAreArray(CastTestVector({1, 4, 0, 3, 1, 4, 0, 3}))); } else if (node_def.op() == "Mul") { EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(6), CType(12), CType(9), CType(18))); + ElementsAreArray( + CastTestVector({6, 12, 9, 18, 6, 12, 9, 18}))); } else if (node_def.op() == "Div") { EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1.5), CType(3), CType(1), CType(2))); + ElementsAreArray(CastTestVector( + {1.5, 3, 1, 2, 1.5, 3, 1, 2}))); } else if (node_def.op() == "RealDiv") { EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1.5), CType(3), CType(1), CType(2))); + ElementsAreArray(CastTestVector( + {1.5, 3, 1, 2, 1.5, 3, 1, 2}))); } else if (node_def.op() == "Minimum") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(2), CType(2), CType(3), CType(3))); + EXPECT_THAT( + GetSpanForData(output_data[0]), + ElementsAreArray(CastTestVector({2, 2, 3, 3, 2, 2, 3, 3}))); } else if (node_def.op() == "Maximum") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(3), CType(6), CType(3), CType(6))); + EXPECT_THAT( + GetSpanForData(output_data[0]), + ElementsAreArray(CastTestVector({3, 6, 3, 6, 3, 6, 3, 6}))); } else if (node_def.op() == "Pow") { ExpectArrayNear( - std::vector{CType(9), CType(36), CType(27), CType(216)}, + CastTestVector({9, 36, 27, 216, 9, 36, 27, 216}), GetSpanForData(output_data[0])); } else { ASSERT_TRUE(false); @@ -2287,58 +2125,48 @@ TEST_F(OpConverterTest, ConvertBinary) { "both input as constant at: my_add"); } - // Test BinaryTensorOpWeight() without broadcasting. - TestBinaryTensorOpWeightNoBroadcast(this); - TestBinaryTensorOpWeightNoBroadcast(this); - TestBinaryTensorOpWeightNoBroadcast(this); - TestBinaryTensorOpWeightNoBroadcast(this); - TestBinaryTensorOpWeightNoBroadcast(this); - - TestBinaryTensorOpWeightNoBroadcast(this); - TestBinaryTensorOpWeightNoBroadcast(this); - TestBinaryTensorOpWeightNoBroadcast(this); - TestBinaryTensorOpWeightNoBroadcast(this); - TestBinaryTensorOpWeightNoBroadcast(this); - - // Test BinaryTensorOpWeight() with channel-wise broadcasting. - TestBinaryTensorOpWeightWithChannelWiseBroadcast(this); - - // Test BinaryTensorOpWeight() with uniformly broadcasting. - TestBinaryTensorOpWeightWithUniformlyBroadcast(this); - - // Test BinaryTensorOpWeight() falling back to BinaryTensorOpTensor(). - // Unsupported op. - TestBinaryTensorOpWeightFallback(this, {1, 1, 1}, {1}); - // Rank of input tensor dimension <3. - TestBinaryTensorOpWeightFallback(this, {1, 1}, {1}); - // Broadcast on batch dimension, should fail. - TestBinaryTensorOpWeightFallback( - this, {1, 1, 1}, {2, 1, 1, 1}, error::INVALID_ARGUMENT, - "Unsupported binary op broadcast scheme for op my_binary", - /*input_batch_size=*/2); - // Incompatible dims with per-channel mode. - TestBinaryTensorOpWeightFallback(this, {1, 1, 1}, {1, 2, 1}); - // Incompatible dims. - TestBinaryTensorOpWeightFallback(this, {1, 2, 1}, {2}); - - // Test BinaryTensorOpTensor() with broadcasting. 
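The expected values above come from the usual right-aligned broadcast of the two non-batch shapes: {1, 2} against {2, 1} yields {2, 2}, so each per-batch output is a 2x2 grid. A small stand-alone illustration of that rule (this is not the converter's GetTrtBroadcastShape, which additionally enforces the implicit-batch restrictions tested earlier):

    #include <algorithm>
    #include <stdexcept>
    #include <vector>

    // Right-aligned, numpy-style broadcast of two shapes (batch dim excluded).
    std::vector<int> BroadcastShape(std::vector<int> a, std::vector<int> b) {
      if (a.size() < b.size()) a.insert(a.begin(), b.size() - a.size(), 1);
      if (b.size() < a.size()) b.insert(b.begin(), a.size() - b.size(), 1);
      std::vector<int> out(a.size());
      for (size_t i = 0; i < a.size(); ++i) {
        if (a[i] != b[i] && a[i] != 1 && b[i] != 1) {
          throw std::invalid_argument("Infeasible broadcast scheme");
        }
        out[i] = std::max(a[i], b[i]);
      }
      return out;  // BroadcastShape({1, 2}, {2, 1}) == {2, 2}
    }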
- TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); - TestBinaryTensorOpTensor(this); + // Test combinations of tensor vs weight inputs (except when both inputs are + // weights). + for (const bool operand_1_is_tensor : {true, false}) { + for (const bool operand_2_is_tensor : {true, false}) { + if (!operand_1_is_tensor && !operand_2_is_tensor) continue; + // FP32 tests + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + // FP16 tests + // TODO(tmorris): Use templates to avoid duplication. + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + TestBinaryOp(this, operand_1_is_tensor, + operand_2_is_tensor); + } + } } TEST_F(OpConverterTest, ConvertQuantize) { @@ -2583,7 +2411,6 @@ TEST_F(OpConverterTest, ConvertCombinedNMS) { // implementation that, the extra output classes that are outside of the // range specified by valid_detections[i] are not zeros but -1s. TestParams{{1, 1, 4}, {1, 3}, 3, 2, .5f, 0, {2, 4}, {2}, {2}}}; - const int batch_size = 1; for (int i = 0; i < kCombinedNMSOKCases; ++i) { Reset(); diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index d325d11dfff..0e5ecc72c60 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -14,6 +14,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h" +#include "absl/strings/ascii.h" +#include "absl/strings/escaping.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" @@ -32,9 +34,9 @@ namespace tensorflow { namespace tensorrt { namespace convert { // TODO(sami): Remove VLOG messages once the code matures +using absl::AsciiStrToUpper; using absl::StrAppend; using absl::StrCat; -using str_util::Uppercase; Status TRTOptimizationPass::Init( const RewriterConfig_CustomGraphOptimizer* config) { @@ -67,7 +69,7 @@ Status TRTOptimizationPass::Init( } if (params.count("precision_mode")) { TF_RETURN_IF_ERROR(TrtPrecisionModeFromName( - Uppercase(params.at("precision_mode").s()), &precision_mode_)); + AsciiStrToUpper(params.at("precision_mode").s()), &precision_mode_)); } if (params.count("use_calibration")) { use_calibration_ = params.at("use_calibration").b(); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc index ec038ebda07..d54cbf7836e 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include #include + #include #include @@ -68,9 +69,9 @@ TEST_F(GetSerializedResourceOpTest, Basic) { TF_ASSERT_OK(RunOpKernel()); // Verify the result. - // TODO(laigd): OpsTestBase::GetOutput() doesn't work. - Tensor* output = context_->mutable_output(0); - EXPECT_EQ("my_serialized_str", output->scalar()()); + // string type output will remain on CPU, so we're not using GetOutput() here. + EXPECT_EQ("my_serialized_str", + context_->mutable_output(0)->scalar()()); } } // namespace tensorrt diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index b62fdc5dc4b..d4077692235 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -87,16 +87,11 @@ TYPED_TEST(TRTEngineOpTest, Basic) { TF_ASSERT_OK(OpsTestBase::RunOpKernel()); // Verify the result. - // TODO(laigd): OpsTestBase::GetOutput() doesn't work. - Tensor* output = OpsTestBase::context_->mutable_output(0); - const auto& tensor_map = output->flat(); - std::vector output_data(tensor_map.size()); - ASSERT_EQ(0, cudaDeviceSynchronize()); - ASSERT_EQ(0, cudaMemcpy(output_data.data(), tensor_map.data(), - sizeof(TypeParam) * tensor_map.size(), - cudaMemcpyDeviceToHost)); - EXPECT_THAT(absl::Span(output_data), - ElementsAre(TypeParam(0.0f), TypeParam(2.0f))); + Tensor* output = OpsTestBase::GetOutput(0); + EXPECT_THAT( + absl::Span(output->template flat().data(), + output->NumElements()), + ElementsAre(TypeParam(0.0f), TypeParam(2.0f))); } } // namespace tensorrt diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index 5d9a1b25210..932966534b7 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -681,31 +681,33 @@ Status SegmentGraph(const Graph* tf_graph, << " with parent=" << segment_root << ":" << s; } - // Don't use small segments. 
- if (static_cast<int>(segment_nodes.size()) < options.minimum_segment_size) { + const int num_effective_nodes = std::count_if( + segment_nodes.begin(), segment_nodes.end(), [](const Node* node) { + static auto noops = + new std::set<string>{"Identity", "Snapshot", "StopGradient"}; + return noops->count(node->type_string()) == 0; + }); + + // Don't use segments whose number of effective nodes is small. + if (num_effective_nodes < options.minimum_segment_size) { VLOG(1) << "Segment " << segments->size() << " has only " - << segment_nodes.size() << " nodes, dropping"; + << num_effective_nodes << " effective nodes, dropping"; continue; } - // TODO(sami): Make segmenter placement aware once trtscopes are in place const auto& dev_itr = device_maps.find(segment_root); if (dev_itr == device_maps.end() || dev_itr->second.empty()) { VLOG(1) << "No device assigned to segment " << segments->size(); - segments->emplace_back(std::make_pair(segment_nodes, string())); } else if (dev_itr->second.size() > 1) { - string s("Segment "); - StrAppend(&s, segments->size(), " has multiple devices attached: "); + string s = StrCat("Segment ", segments->size(), + " has multiple devices attached: "); for (const auto& dev : dev_itr->second) { StrAppend(&s, dev, ", "); } - LOG(WARNING) << s << " choosing " << *(dev_itr->second.begin()); - segments->emplace_back( - std::make_pair(segment_nodes, *(dev_itr->second.begin()))); - } else { - segments->emplace_back( - std::make_pair(segment_nodes, *(dev_itr->second.begin()))); + LOG(WARNING) << s; } + + segments->emplace_back(segment_nodes); } if (VLOG_IS_ON(1)) { for (const auto& d : device_maps) { diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h index e31f1a989d9..77c0af223c8 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.h +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h @@ -31,10 +31,8 @@ namespace tensorflow { namespace tensorrt { namespace segment { -// Vector of segments, each entry contains a set of node pointers and a device -// name in the segment. -using SegmentNodesVector = - std::vector<std::pair<std::set<const Node*>, string>>; +// Vector of segments, each entry contains a set of node pointers. +using SegmentNodesVector = std::vector<std::set<const Node*>>; struct SegmentOptions { // Segment must contain at least this many nodes.
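Since segments are now plain node sets, the minimum-segment-size check above counts only "effective" nodes and ignores pass-through ops that give TensorRT no real work. A self-contained sketch of that filter (the lightweight node type here is a stand-in for tensorflow::Node, not part of the patch):

    #include <algorithm>
    #include <set>
    #include <string>
    #include <vector>

    struct FakeNode {
      std::string type;
      const std::string& type_string() const { return type; }
    };

    // Counts nodes that contribute real work to a candidate TensorRT segment.
    int CountEffectiveNodes(const std::vector<FakeNode>& segment_nodes) {
      static const std::set<std::string> kNoOps = {"Identity", "Snapshot",
                                                   "StopGradient"};
      return static_cast<int>(
          std::count_if(segment_nodes.begin(), segment_nodes.end(),
                        [](const FakeNode& node) {
                          return kNoOps.count(node.type_string()) == 0;
                        }));
    }
    // A chain made only of Identity ops has zero effective nodes, which is why
    // the IdentityOps test added below expects no segment to be formed.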
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc index 84b690ecba6..cb038e58126 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc @@ -77,7 +77,7 @@ class SegmentTest : public ::testing::Test { EXPECT_EQ(expected_segments.size(), segments.size()); for (int i = 0; i < segments.size(); ++i) { std::set<string> segment_node_names; - for (const Node* node : segments[i].first) { + for (const Node* node : segments[i]) { segment_node_names.insert(node->name()); } const auto& expected = expected_segments[i]; @@ -262,6 +262,23 @@ TEST_F(SegmentTest, BigIfElse) { {{"add0", "add1"}, {"add3", "add4", "add5", "add6", "add7"}}); } +TEST_F(SegmentTest, IdentityOps) { + Scope s = Scope::NewRootScope(); + auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT); + auto identity0 = ops::Identity(s.WithOpName("identity0"), feed); + auto identity1 = ops::Identity(s.WithOpName("identity1"), identity0); + auto identity2 = ops::Identity(s.WithOpName("identity2"), identity1); + auto identity3 = ops::Identity(s.WithOpName("identity3"), identity2); + Graph g(OpRegistry::Global()); + TF_EXPECT_OK(s.ToGraph(&g)); + + const std::set<string> all_identities = {"identity0", "identity1", + "identity2", "identity3"}; + // Identity ops are not counted as effective ops in the segment, so no segment + // will be formed in this case. + RunTest(&g, all_identities, all_identities, all_identities, {}); +} + } // namespace test } // namespace segment } // namespace tensorrt diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index dcce43cbe70..2bc8ab45a51 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -1,7 +1,10 @@ -licenses(["notice"]) # Apache 2.0 - load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_cuda_cc_test") +package( + default_visibility = [":internal"], + licenses = ["notice"], # Apache 2.0 +) + package_group( name = "internal", packages = [ @@ -23,15 +26,12 @@ package_group( ], ) -package( - default_visibility = [":internal"], -) - load( "//tensorflow/core:platform/default/cuda_build_defs.bzl", "if_cuda_is_configured", ) load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library", "xla_py_proto_library") +load("//tensorflow:tensorflow.bzl", "tf_portable_proto_library") cc_library( name = "tf2xla_supported_ops_lib", @@ -67,6 +67,19 @@ xla_proto_library( ], ) +# A proto library that is minimal in size and dependencies for platforms like Android.
+tf_portable_proto_library( + name = "portable_tf2xla_proto", + config_string = "allow_all:true", + header_outs = ["//tensorflow/compiler/tf2xla/tf2xla.proto.h"], + portable_deps = ["//tensorflow/core:android_proto_lib"], + proto_deps = [ + ":tf2xla_proto", + "//tensorflow/core:protos_all_cc", + ], + visibility = ["//visibility:public"], +) + xla_py_proto_library( name = "tf2xla_py", has_services = False, diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD index adcdb6c8f76..fb7c8c56ac7 100644 --- a/tensorflow/compiler/tf2xla/cc/BUILD +++ b/tensorflow/compiler/tf2xla/cc/BUILD @@ -1,9 +1,8 @@ package( default_visibility = ["//tensorflow/compiler/tf2xla:internal"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_cc") tf_gen_op_wrapper_cc( diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc index 6e093400e47..3aaa2eed432 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -918,10 +918,16 @@ string Conditional::name() const { Status FunctionalizeCond::AddIdentityNode(const Node* replacee, Node* if_node, int port) { + NodeBuilder id_builder(replacee->name(), "Identity"); + id_builder.Input(if_node, port); + string outside_compilation; + if (GetNodeAttr(if_node->def(), kXlaOutsideCompilationAttrName, + &outside_compilation) + .ok()) { + id_builder.Attr(kXlaOutsideCompilationAttrName, outside_compilation); + } Node* id; - TF_RETURN_IF_ERROR(NodeBuilder(replacee->name(), "Identity") - .Input(if_node, port) - .Finalize(graph_, &id)); + TF_RETURN_IF_ERROR(id_builder.Finalize(graph_, &id)); state_map_.ResetCondId(id, state_map_.LookupCondId(if_node)); state_map_.ResetAncestorId(id, state_map_.LookupAncestorId(if_node)); return Status::OK(); diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 89d5a860179..294a104b3b5 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -247,8 +247,8 @@ Status FunctionalizeControlFlowPass::Run( // multiple times, and we want to avoid functionalize it again. static std::map* kNodeTypeToFunctionAttrMapping = new std::map{ - // TPUReplicate ops are generated by EncapsulateTPUComputationsPass. - {"TPUReplicate", "computation"}, + // _TPUReplicate ops are generated by EncapsulateTPUComputationsPass. + {"_TPUReplicate", "computation"}, // XlaLaunch ops are generated by EncapsulateXlaComputationsPass. 
{"XlaLaunch", "function"}, }; diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index d6dfa39e658..06376f7174e 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -1,9 +1,8 @@ load("//tensorflow:tensorflow.bzl", "tf_copts", "tf_kernel_library") -licenses(["notice"]) # Apache 2.0 - package( default_visibility = ["//tensorflow/compiler/tf2xla:internal"], + licenses = ["notice"], # Apache 2.0 ) tf_kernel_library( @@ -195,6 +194,7 @@ tf_kernel_library( "//tensorflow/core/kernels:training_ops", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/compiler/tf2xla/kernels/assert_op.cc b/tensorflow/compiler/tf2xla/kernels/assert_op.cc index af4ab5e8ef6..94543686b47 100644 --- a/tensorflow/compiler/tf2xla/kernels/assert_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/assert_op.cc @@ -43,7 +43,7 @@ class AssertOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(AssertOp); }; -REGISTER_XLA_OP(Name("Assert"), AssertOp); +REGISTER_XLA_OP(Name("Assert").CompilationOnly(), AssertOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc index 84eda80fc25..013a5734863 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc @@ -39,7 +39,10 @@ class FusedBatchNormOp : public XlaOpKernel { is_on_gpu_ = ctx->device_type().type_string() == DEVICE_GPU_XLA_JIT; } - void Compile(XlaOpKernelContext* ctx) override { + void Compile(XlaOpKernelContext* ctx) override { CompileImpl(ctx); } + + protected: + virtual void CompileImpl(XlaOpKernelContext* ctx) { xla::PrimitiveType input_type; OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(0), &input_type)); @@ -116,8 +119,29 @@ class FusedBatchNormOp : public XlaOpKernel { bool is_on_gpu_; }; +class FusedBatchNormOpV3 : public FusedBatchNormOp { + public: + explicit FusedBatchNormOpV3(OpKernelConstruction* ctx) + : FusedBatchNormOp(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + FusedBatchNormOp::CompileImpl(ctx); + if (!ctx->status().ok()) { + return; + } + ctx->SetConstantOutput(5, Tensor()); + } + + private: + float epsilon_; + TensorFormat data_format_; + bool is_training_; + bool is_on_gpu_; +}; + REGISTER_XLA_OP(Name("FusedBatchNorm"), FusedBatchNormOp); REGISTER_XLA_OP(Name("FusedBatchNormV2"), FusedBatchNormOp); +REGISTER_XLA_OP(Name("FusedBatchNormV3"), FusedBatchNormOpV3); class FusedBatchNormGradOp : public XlaOpKernel { public: @@ -233,6 +257,7 @@ class FusedBatchNormGradOp : public XlaOpKernel { REGISTER_XLA_OP(Name("FusedBatchNormGrad"), FusedBatchNormGradOp); REGISTER_XLA_OP(Name("FusedBatchNormGradV2"), FusedBatchNormGradOp); +REGISTER_XLA_OP(Name("FusedBatchNormGradV3"), FusedBatchNormGradOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc index d801d560040..258d8f75cde 100644 --- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/util/tensor_format.h" @@ -150,6 +151,15 @@ class ExtractImagePatchesOp : public XlaOpKernel { xla::XlaOp conv = xla::ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding, lhs_dilation, rhs_dilation, dims, depth); + // Feature group convolution, will end up with the kernel_size change more + // rapidly than the depth. Reshape, transpose and reshape to reorder them. + auto conv_dims = builder->GetShape(conv).ValueOrDie().dimensions(); + conv_dims.back() = depth; + conv_dims.push_back(kernel_size); + conv = xla::TransposeInMinorDims(xla::Reshape(conv, conv_dims)); + conv_dims.pop_back(); + conv_dims.back() *= kernel_size; + conv = xla::Reshape(conv, conv_dims); ctx->SetOutput(0, conv); } diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index 6472045265e..489ffd3fdad 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include "absl/types/optional.h" #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -20,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -148,15 +152,22 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, class GatherOp : public XlaOpKernel { public: - explicit GatherOp(OpKernelConstruction* context) : XlaOpKernel(context) {} + explicit GatherOp(OpKernelConstruction* context) : XlaOpKernel(context) { + // Set batch_dims_ to 0 if the attribute does not exist. 
+ if (context->HasAttr("batch_dims")) { + OP_REQUIRES_OK(context, context->GetAttr("batch_dims", &batch_dims_)); + } else { + batch_dims_ = 0; + } + } void Compile(XlaOpKernelContext* context) override { - xla::XlaBuilder* builder = context->builder(); auto input = context->Input(0); auto input_shape = context->InputShape(0); auto indices = context->Input(1); auto indices_shape = context->InputShape(1); - int64 axis = 0; + + absl::optional axis; if (context->num_inputs() == 3) { const TensorShape axis_shape = context->InputShape(2); OP_REQUIRES(context, TensorShapeUtils::IsScalar(axis_shape), @@ -165,31 +176,73 @@ class GatherOp : public XlaOpKernel { OP_REQUIRES(context, axis_type == DT_INT32 || axis_type == DT_INT64, errors::InvalidArgument("axis must be int32 or int64")); - OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &axis)); + int64 axis_input; + OP_REQUIRES_OK(context, + context->ConstantInputAsIntScalar(2, &axis_input)); + const auto params_dims = input_shape.dims(); - OP_REQUIRES( - context, -params_dims <= axis && axis < params_dims, - errors::InvalidArgument("Expected axis in the range [", -params_dims, - ", ", params_dims, "), but got ", axis)); - if (axis < 0) { - axis += params_dims; + OP_REQUIRES(context, + -params_dims <= axis_input && axis_input < params_dims, + errors::InvalidArgument("Expected axis in the range [", + -params_dims, ", ", params_dims, + "), but got ", axis_input)); + if (axis_input < 0) { + axis_input += params_dims; } + axis = axis_input; } + if (batch_dims_ != 0) { + if (batch_dims_ < 0) { + batch_dims_ = indices_shape.dims() + batch_dims_; + } + + axis = axis.value_or(batch_dims_); + + OP_REQUIRES(context, + batch_dims_ >= -indices_shape.dims() && + batch_dims_ < indices_shape.dims(), + errors::InvalidArgument("Expected batch_dims in the range [", + -indices_shape.dims(), ", ", + indices_shape.dims(), "), but got ", + batch_dims_)); + + OP_REQUIRES(context, batch_dims_ < input_shape.dims(), + errors::InvalidArgument("batch_dims (", batch_dims_, + ") must be less than rank(input) (", + input_shape.dims(), ").")); + + OP_REQUIRES(context, *axis >= batch_dims_, + errors::InvalidArgument("batch_dims (", batch_dims_, + ") must be less than or equal to ", + "axis (", *axis, ").")); + } + + axis = axis.value_or(0); DataType index_type = input_type(1); OP_REQUIRES(context, index_type == DT_INT32 || index_type == DT_INT64, errors::InvalidArgument("indices must be int32 or int64")); xla::XlaOp gather; - OP_REQUIRES_OK( - context, XlaGather(input, input_shape, indices, indices_shape, axis, - /*indices_are_nd=*/false, input_type(0), index_type, - builder, &gather)); + if (batch_dims_ > 0) { + gather = xla::TorchIndexSelect(input, indices, *axis, batch_dims_); + } else { + // XlaGather() manages degenerate cases, like empty-indices, which are + // error conditions and caught above if batch_dims is not 0. + OP_REQUIRES_OK( + context, XlaGather(input, input_shape, indices, indices_shape, *axis, + /*indices_are_nd=*/false, input_type(0), + index_type, context->builder(), &gather)); + } context->SetOutput(0, gather); } private: TF_DISALLOW_COPY_AND_ASSIGN(GatherOp); + + // The number of batch dimensions, as passed in the batch_dims attribute. + // It must be less than rank(indices). 
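For the gather change above, when batch_dims is nonzero the kernel routes through xla::TorchIndexSelect, which gathers along the axis independently for each leading batch. A rough reference of what that computes in the simplest case (batch_dims = 1, axis = 1, rank-2 params and indices); plain C++, illustrative only:

#include <cstdio>
#include <vector>

// Batched gather with batch_dims = 1 and axis = 1:
//   out[b][m] = params[b][indices[b][m]]
// Each batch row of `indices` selects from the matching row of `params`.
std::vector<std::vector<float>> BatchedGather(
    const std::vector<std::vector<float>>& params,
    const std::vector<std::vector<int>>& indices) {
  std::vector<std::vector<float>> out(params.size());
  for (size_t b = 0; b < params.size(); ++b) {
    for (int idx : indices[b]) {
      out[b].push_back(params[b][idx]);
    }
  }
  return out;
}

int main() {
  std::vector<std::vector<float>> params = {{10, 11, 12}, {20, 21, 22}};
  std::vector<std::vector<int>> indices = {{2, 0}, {1, 1}};
  auto out = BatchedGather(params, indices);
  for (const auto& row : out) {
    for (float v : row) std::printf("%g ", v);  // 12 10 / 21 21
    std::printf("\n");
  }
  return 0;
}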
+ int32 batch_dims_ = 0; }; REGISTER_XLA_OP(Name("Gather"), GatherOp); diff --git a/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc b/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc index 9c6fcf429d4..246d3f6da94 100644 --- a/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/in_topk_op.cc @@ -81,20 +81,21 @@ class InTopKOp : public XlaOpKernel { xla::CreateScalarAddComputation(xla::F32, xla_builder), {1}); // Calculate in each row of `predictions`, how many values are larger than - // the value of target class. Then return the result whether the count <= k, + // the value of target class. Then return the result whether the count < k, // which indicates the target is in topk. - xla::XlaOp ge_r2 = xla::Ge(predictions_r2, targets_values_r1, {0}); + xla::XlaOp gt_r2 = xla::Gt(predictions_r2, targets_values_r1, {0}); xla::XlaOp zero_r0 = xla::Zero(xla_builder, xla::S32); xla::XlaOp zero_r2 = xla::Broadcast(zero_r0, predictions_shape.dim_sizes()); xla::XlaOp one_r0 = xla::One(xla_builder, xla::S32); xla::XlaOp one_r2 = xla::Broadcast(one_r0, predictions_shape.dim_sizes()); - xla::XlaOp one_hot_r2 = xla::Select(ge_r2, one_r2, zero_r2); - xla::XlaOp num_ge_r1 = xla::Reduce( + xla::XlaOp one_hot_r2 = xla::Select(gt_r2, one_r2, zero_r2); + xla::XlaOp num_gt_r1 = xla::Reduce( one_hot_r2, zero_r0, xla::CreateScalarAddComputation(xla::S32, xla_builder), {1}); xla::XlaOp result = - xla::Le(num_ge_r1, xla::ConstantR0(xla_builder, k)); + xla::And(xla::Lt(num_gt_r1, xla::ConstantR0(xla_builder, k)), + xla::IsFinite(targets_values_r1)); context->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc index f36e0025250..a3fcb4d4b8f 100644 --- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc @@ -67,9 +67,9 @@ class MatMulOp : public XlaOpKernel { OP_REQUIRES(ctx, a_shape.dim_size(first_index) == b_shape.dim_size(second_index), - errors::InvalidArgument("Matrix size-compatible: In[0]: ", - a_shape.DebugString(), ", In[1]: ", - b_shape.DebugString())); + errors::InvalidArgument( + "Matrix size-incompatible: In[0]: ", a_shape.DebugString(), + ", In[1]: ", b_shape.DebugString())); xla::XlaOp a = ctx->Input(0); xla::XlaOp b = ctx->Input(1); diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc index ed303ba2774..70e4f96c0da 100644 --- a/tensorflow/compiler/tf2xla/kernels/select_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" @@ -22,6 +23,7 @@ limitations under the License. 
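The in_topk_op change above tightens the membership test: a target is in the top k if fewer than k predictions are strictly greater than the prediction at the target class, and that prediction value is itself finite (non-finite target values are rejected). A scalar reference version of the predicate, as a standalone sketch:

#include <cmath>
#include <cstdio>
#include <vector>

// Returns true if predictions[target] is in the top k: fewer than k entries
// are strictly greater than the target's value, and that value is finite.
bool InTopK(const std::vector<float>& predictions, int target, int k) {
  float target_value = predictions[target];
  if (!std::isfinite(target_value)) return false;
  int num_greater = 0;
  for (float p : predictions) {
    if (p > target_value) ++num_greater;
  }
  return num_greater < k;
}

int main() {
  std::vector<float> predictions = {0.1f, 0.8f, 0.8f, 0.3f};
  // Classes 1 and 2 both count as top-1: nothing is strictly greater than 0.8,
  // so ties do not push either of them out.
  std::printf("%d %d %d\n", InTopK(predictions, 1, 1),
              InTopK(predictions, 2, 1), InTopK(predictions, 0, 1));  // 1 1 0
  return 0;
}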
#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/util/bcast.h" namespace tensorflow { namespace { @@ -77,5 +79,58 @@ class SelectOp : public XlaOpKernel { REGISTER_XLA_OP(Name("Select"), SelectOp); +class SelectOpV2 : public XlaOpKernel { + public: + explicit SelectOpV2(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape cond_shape = ctx->InputShape(0); + const TensorShape then_shape = ctx->InputShape(1); + const TensorShape else_shape = ctx->InputShape(2); + + // Compute the output shape from the broadcast of the two data inputs, with + // the broadcast of the conditional. + // Then Broadcast all three inputs to the output shape and emit a select. + + BCast bcast_then_else(BCast::FromShape(then_shape), + BCast::FromShape(else_shape), + /*fewer_dims_optimization=*/false); + if (!bcast_then_else.IsValid()) { + ctx->SetStatus(errors::InvalidArgument( + "Incompatible shapes: ", then_shape.DebugString(), " vs. ", + else_shape.DebugString())); + return; + } + BCast bcast(bcast_then_else.output_shape(), BCast::FromShape(cond_shape), + /*fewer_dims_optimization=*/false); + if (!bcast.IsValid()) { + ctx->SetStatus(errors::InvalidArgument( + "Incompatible shapes: ", + BCast::ToShape(bcast_then_else.output_shape()).DebugString(), " vs. ", + cond_shape.DebugString())); + return; + } + + auto bcasted_cond = BroadcastTo(ctx->Input(0), bcast.output_shape()); + OP_REQUIRES_OK(ctx, bcasted_cond.status()); + auto cond_handle = bcasted_cond.ValueOrDie(); + + auto bcasted_then = BroadcastTo(ctx->Input(1), bcast.output_shape()); + OP_REQUIRES_OK(ctx, bcasted_then.status()); + auto then_handle = bcasted_then.ValueOrDie(); + + auto bcasted_else = BroadcastTo(ctx->Input(2), bcast.output_shape()); + OP_REQUIRES_OK(ctx, bcasted_else.status()); + auto else_handle = bcasted_else.ValueOrDie(); + + ctx->SetOutput(0, xla::Select(cond_handle, then_handle, else_handle)); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(SelectOpV2); +}; + +REGISTER_XLA_OP(Name("SelectV2"), SelectOpV2); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc index 20da8033536..dc1b0c21096 100644 --- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc @@ -16,6 +16,7 @@ limitations under the License. // XLA-specific Ops for softmax. #include "absl/strings/match.h" +#include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" @@ -145,23 +146,36 @@ class SoftmaxXentWithLogitsOp : public XlaOpKernel { : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - const TensorShape logits_shape = ctx->InputShape(0); - const TensorShape labels_shape = ctx->InputShape(1); - OP_REQUIRES(ctx, logits_shape.IsSameSize(labels_shape), - errors::InvalidArgument( - "logits and labels must be same size: logits_size=", - logits_shape.DebugString(), - " labels_size=", labels_shape.DebugString())); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_shape), - errors::InvalidArgument("logits must be 2-dimensional")); - // As we already tested that both inputs have the same shape no need to - // check that "labels" is a matrix too. 
- const DataType type = input_type(0); const xla::PrimitiveType xla_type = ctx->input_xla_type(0); auto logits = ctx->Input(0); auto labels = ctx->Input(1); + const TensorShape logits_shape = ctx->InputShape(0); + const TensorShape labels_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_shape), + errors::InvalidArgument("logits must be 2-dimensional")); + OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(labels_shape), + errors::InvalidArgument("labels must be 2-dimensional")); + + // Confirm that any necessary broadcasting to make the shapes the same will + // succeed. + for (int dim = 0; dim < 2; dim++) { + OP_REQUIRES( + ctx, + labels_shape.dim_size(dim) == 1 || + logits_shape.dim_size(dim) == labels_shape.dim_size(dim), + errors::InvalidArgument("logits and labels must be same size after " + "broadcasting of labels: logits_size=", + logits_shape.DebugString(), + " labels_size=", labels_shape.DebugString())); + } + if (!logits_shape.IsSameSize(labels_shape)) { + auto labels_or = BroadcastTo(labels, logits_shape.dim_sizes()); + OP_REQUIRES_OK(ctx, labels_or.status()); + labels = labels_or.ConsumeValueOrDie(); + } + xla::XlaOp loss, backprop; std::tie(loss, backprop) = CrossEntropyWithLogits(ctx, type, xla_type, logits, labels); diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD index f9ce50be6e3..5b1f92b24c8 100644 --- a/tensorflow/compiler/tf2xla/lib/BUILD +++ b/tensorflow/compiler/tf2xla/lib/BUILD @@ -1,9 +1,8 @@ # Utilities for building XLA computations. -licenses(["notice"]) # Apache 2.0 - package( default_visibility = ["//tensorflow/compiler/tf2xla:friends"], + licenses = ["notice"], # Apache 2.0 ) # Filegroup used to collect source files for dependency checking. diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD index 4f1f3d7c326..17a62e83d5f 100644 --- a/tensorflow/compiler/tf2xla/ops/BUILD +++ b/tensorflow/compiler/tf2xla/ops/BUILD @@ -1,9 +1,8 @@ package( default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - load( "//tensorflow:tensorflow.bzl", "tf_custom_op_library", diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD index c6f57b386eb..c731d52ea2b 100644 --- a/tensorflow/compiler/tf2xla/python/BUILD +++ b/tensorflow/compiler/tf2xla/python/BUILD @@ -1,9 +1,8 @@ -licenses(["notice"]) # Apache 2.0 - package( default_visibility = [ "//visibility:public", ], + licenses = ["notice"], # Apache 2.0 ) load( diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index b8eda1de94a..dcdf5acdace 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -550,6 +550,7 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { }; GraphOptimizer::Options graph_optimizer_options; graph_optimizer_options.cf_consider_fn = cf_consider_fn; + graph_optimizer_options.inline_multi_device_functions = true; optimizer.Optimize(flib_runtime_, flib_runtime_->env(), /*device=*/nullptr, &graph, graph_optimizer_options); diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index 95d1bf25150..7c6c53a225f 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -116,9 +116,12 @@ class XlaOpRegistry { // If we should cluster operations returning DT_VARIANT. 
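The softmax cross-entropy change above stops requiring logits and labels to have identical shapes; instead, each label dimension must either equal the corresponding logits dimension or be 1, after which the labels are broadcast up to the logits shape. A minimal sketch of that per-dimension compatibility check for the 2-D case:

#include <cstdio>

// Labels of shape [lr, lc] can be broadcast to logits of shape [r, c] iff
// each label dimension is either 1 or equal to the logits dimension.
bool LabelsBroadcastableToLogits(int logits_rows, int logits_cols,
                                 int labels_rows, int labels_cols) {
  bool rows_ok = labels_rows == 1 || labels_rows == logits_rows;
  bool cols_ok = labels_cols == 1 || labels_cols == logits_cols;
  return rows_ok && cols_ok;
}

int main() {
  std::printf("%d\n", LabelsBroadcastableToLogits(8, 10, 1, 10));  // 1
  std::printf("%d\n", LabelsBroadcastableToLogits(8, 10, 8, 10));  // 1
  std::printf("%d\n", LabelsBroadcastableToLogits(8, 10, 4, 10));  // 0
  return 0;
}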
bool cluster_variant_ops = false; - // Whether ops known to be slow or to have correctness issues should be + // Whether ops known to be slow should be auto-clustered. + bool cluster_slow_ops = false; + + // Whether ops known to have numerical accuracy issues should be // auto-clustered. - bool cluster_slow_and_inaccurate_ops = false; + bool cluster_inaccurate_ops = false; }; // Registers an XLA backend. `compilation_device_name` is the name of the diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 91f33ff914e..60c8c857f0e 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) package_group( name = "friends", @@ -575,6 +576,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":types", + "@com_google_absl//absl/strings", ], ) @@ -881,6 +883,26 @@ tf_cc_test( ], ) +cc_library( + name = "refcounting_hash_map", + hdrs = ["refcounting_hash_map.h"], + deps = [ + "@com_google_absl//absl/container:node_hash_map", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/synchronization", + ], +) + +tf_cc_test( + name = "refcounting_hash_map_test", + srcs = ["refcounting_hash_map_test.cc"], + deps = [ + ":refcounting_hash_map", + ":test", + "//tensorflow/core:test_main", + ], +) + # ----------------------------------------------------------------------------- # This is a headers target that extra XLA devices can use to prevent circular dependencies. Devices that are compiled as separate shared objects can also use it to prevent linking of library code. diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index b800229bd90..806521756dc 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -1,9 +1,10 @@ # Description: # XLA client libraries. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//visibility:public"]) +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) package_group( name = "friends", diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index 4a99debbe70..acf59c47f3c 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -2,9 +2,10 @@ load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites", "xla_test") -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow/compiler/xla/client:friends"]) +package( + default_visibility = ["//tensorflow/compiler/xla/client:friends"], + licenses = ["notice"], # Apache 2.0 +) # Filegroup used to collect source files for dependency checking. filegroup( @@ -472,11 +473,6 @@ cc_library( xla_test( name = "svd_test", srcs = ["svd_test.cc"], - # Blacklisted because the tests are flaky. 
- blacklisted_backends = [ - "cpu", - "gpu", - ], real_hardware_only = True, shard_count = 10, tags = ["optonly"], diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc index 93f3d3ab131..902269d9412 100644 --- a/tensorflow/compiler/xla/client/lib/matrix.cc +++ b/tensorflow/compiler/xla/client/lib/matrix.cc @@ -46,23 +46,34 @@ XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, return ConvertElementType(indicator, type); } +XlaOp GetDiagonalMask(XlaOp x, int diagonal) { + XlaBuilder* builder = x.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); + auto n_dims = static_cast(shape.rank()); + TF_RET_CHECK(n_dims >= 2); + auto m = shape.dimensions(n_dims - 2); + auto n = shape.dimensions(n_dims - 1); + absl::Span major_dims = + AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2); + auto a = Iota(builder, S32, n); + auto b = Iota(builder, S32, m) + ConstantR0WithType(builder, S32, diagonal); + auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0}); + auto mask = Broadcast(indicator, major_dims); + return mask; + }); +} + XlaOp GetMatrixDiagonal(XlaOp x, int k) { XlaBuilder* builder = x.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x)); - const int64 n_dims = shape.rank(); + auto n_dims = static_cast(shape.rank()); TF_RET_CHECK(n_dims >= 2); const int64 m = shape.dimensions(n_dims - 2); const int64 n = shape.dimensions(n_dims - 1); - auto offset = ConstantR0WithType(builder, S32, k); - - absl::Span major_dims = - AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2); - auto a = Iota(builder, S32, n); - auto b = Iota(builder, S32, m) + offset; - auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0}); - auto mask = Broadcast(indicator, major_dims); + auto mask = GetDiagonalMask(x, k); // TPUs don't support S64 add reduction at the moment. But fortunately // OR-reductions work just as well for integers. diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h index 5f1ca964a41..541ce2897f5 100644 --- a/tensorflow/compiler/xla/client/lib/matrix.h +++ b/tensorflow/compiler/xla/client/lib/matrix.h @@ -31,6 +31,10 @@ namespace xla { // else. XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n); +// Returns a mask where the 'diagonal'-th diagonal is true and everything else +// is false. +XlaOp GetDiagonalMask(XlaOp x, int diagonal = 0); + // Get the diagonals of the last two dimensions. Use k>0 for diagonals above the // main diagonal, and k<0 for diagonals below the main diagonal. // diff --git a/tensorflow/compiler/xla/client/lib/svd.cc b/tensorflow/compiler/xla/client/lib/svd.cc index 53a23872709..646875a20a2 100644 --- a/tensorflow/compiler/xla/client/lib/svd.cc +++ b/tensorflow/compiler/xla/client/lib/svd.cc @@ -75,11 +75,6 @@ struct OneSidedJacobiRotation { JacobiRotation rot_r; }; -struct FrobeniusNorms { - XlaOp off_diagonal_norm; - XlaOp total_norm; -}; - // Householder reflection on the trailing elements of a vector. 
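GetDiagonalMask, added in the matrix.cc change above, materializes a boolean mask that is true exactly on the requested diagonal of the trailing two dimensions (diagonal > 0 selects superdiagonals, < 0 subdiagonals). The same predicate for a single m x n matrix, written out directly as a standalone sketch:

#include <cstdio>
#include <vector>

// mask[i][j] is true iff element (i, j) lies on the requested diagonal:
// diagonal == 0 is the main diagonal, > 0 is above it, < 0 below it.
std::vector<std::vector<bool>> DiagonalMask(int m, int n, int diagonal) {
  std::vector<std::vector<bool>> mask(m, std::vector<bool>(n, false));
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      mask[i][j] = (j - i == diagonal);
    }
  }
  return mask;
}

int main() {
  auto mask = DiagonalMask(3, 4, 1);  // first superdiagonal of a 3x4 matrix
  for (const auto& row : mask) {
    for (bool b : row) std::printf("%d ", b ? 1 : 0);
    std::printf("\n");
  }
  // 0 1 0 0
  // 0 0 1 0
  // 0 0 0 1
  return 0;
}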
// // H = I - beta * [1, v]' * [1, v] @@ -567,27 +562,26 @@ StatusOr OneSidedJacobiUpdate(SVDResult svd_result, XlaOp p, XlaOp q, return svd_result; } -StatusOr ComputeFrobeniusNorms(XlaOp w) { +StatusOr ComputeToleranceComparison(XlaOp w, XlaOp epsilon) { XlaBuilder* builder = w.builder(); TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(w)); - const int64 num_dims = shape.rank(); - auto frobenius_norm = - Sqrt(Reduce(Square(w), ScalarLike(w, 0.0), - CreateScalarAddComputation(shape.element_type(), builder), - {num_dims - 2, num_dims - 1})); - auto diag = GetMatrixDiagonal(w); - auto diag_square = - Reduce(Square(diag), ScalarLike(w, 0.0), - CreateScalarAddComputation(shape.element_type(), builder), - {num_dims - 2}); - - FrobeniusNorms frobenius_norms; - - frobenius_norms.off_diagonal_norm = - Sqrt(Max(Square(frobenius_norm) - diag_square, ScalarLike(w, 0.0))); - frobenius_norms.total_norm = frobenius_norm; - - return frobenius_norms; + auto num_dims = static_cast(shape.rank()); + int64 n = shape.dimensions(num_dims - 1); + shape.set_dimensions(num_dims - 2, n); + auto w_sliced = SliceInMinorDims(w, {0, 0}, {n, n}); + auto diag = GetMatrixDiagonal(w_sliced); + diag = Select(Lt(diag, ZerosLike(diag)), -diag, diag); + std::vector broadcasted_dims(num_dims - 1); + std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0); + auto broadcast_to_rows = + BroadcastInDim(diag, shape.dimensions(), broadcasted_dims); + broadcasted_dims.back() = num_dims - 1; + auto broadcast_to_columns = + BroadcastInDim(diag, shape.dimensions(), broadcasted_dims); + // Compute w_{i,i} * w_{j,j} * epsilon^2 < (w_{i,j})^2 + return Lt( + broadcast_to_rows * broadcast_to_columns * epsilon * epsilon, + Square(Select(GetDiagonalMask(w_sliced), ZerosLike(w_sliced), w_sliced))); } // Main boby of One-sided Jacobi Method. @@ -603,13 +597,13 @@ StatusOr> WhileLoopFn( auto max_sweeps = ScalarLike(k, max_sweep_updates); auto sweep_update_cond = Gt(max_sweeps, k); - auto norms = ComputeFrobeniusNorms(values[3]).ValueOrDie(); - auto tol = norms.total_norm * values[4]; - auto tol_cond = ReduceAll(Lt(tol, norms.off_diagonal_norm), - xla::ConstantR0(cond_builder, false), - CreateScalarOrComputation(PRED, cond_builder)); + TF_ASSIGN_OR_RETURN(auto tolerance_comparison, + ComputeToleranceComparison(values[3], values[4])); + auto tolerance_cond = ReduceAll( + tolerance_comparison, xla::ConstantR0(cond_builder, false), + CreateScalarOrComputation(PRED, cond_builder)); - return And(sweep_update_cond, tol_cond); + return And(sweep_update_cond, tolerance_cond); }; auto while_body_fn = diff --git a/tensorflow/compiler/xla/client/lib/svd_test.cc b/tensorflow/compiler/xla/client/lib/svd_test.cc index a987f7fcaf6..a39238548fc 100644 --- a/tensorflow/compiler/xla/client/lib/svd_test.cc +++ b/tensorflow/compiler/xla/client/lib/svd_test.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/client/lib/svd.h" + #include #include "tensorflow/compiler/xla/array2d.h" @@ -183,12 +184,14 @@ XLA_TEST_F(SVDTest, TestSingleValuesMatchNumpy) { ErrorSpec(1e-3, 1e-3)); } -XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_512x128) { +// Too slow on the interpreter backend. 
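ComputeToleranceComparison in the svd.cc change above replaces the Frobenius-norm convergence test with a per-element one: an off-diagonal entry w_ij still counts as too large while |w_ii| * |w_jj| * eps^2 < w_ij^2, and the Jacobi sweep loop keeps going as long as any entry satisfies that. A scalar sketch of the predicate over a small square matrix (diagonal entries taken by absolute value, as in the patch; the leading-block slicing for rectangular inputs is omitted):

#include <cmath>
#include <cstdio>
#include <vector>

// Returns true if any off-diagonal element is still too large relative to
// the corresponding diagonal entries, i.e. |w_ii| * |w_jj| * eps^2 < w_ij^2.
bool NeedsMoreSweeps(const std::vector<std::vector<double>>& w, double eps) {
  const size_t n = w.size();
  for (size_t i = 0; i < n; ++i) {
    for (size_t j = 0; j < n; ++j) {
      if (i == j) continue;
      double dii = std::fabs(w[i][i]);
      double djj = std::fabs(w[j][j]);
      if (dii * djj * eps * eps < w[i][j] * w[i][j]) return true;
    }
  }
  return false;
}

int main() {
  std::vector<std::vector<double>> w = {{4.0, 1e-7}, {1e-7, 2.0}};
  std::printf("%d\n", NeedsMoreSweeps(w, 1e-4));  // 0: converged at eps = 1e-4
  std::printf("%d\n", NeedsMoreSweeps(w, 1e-9));  // 1: not converged at 1e-9
  return 0;
}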
+XLA_TEST_F(SVDTest, + DISABLED_ON_INTERPRETER(Various_Size_Random_Matrix_512x128)) { XlaBuilder builder(TestName()); Array2D a_val = GenerateRandomMatrix(512, 128); XlaOp a; auto a_data = CreateR2Parameter(a_val, 0, "a", &builder, &a); - auto result = SVD(a, 100, 1e-6); + auto result = SVD(a, 100, 1e-4); GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder); ComputeAndCompareR0(&builder, 1e-3, {a_data.get()}, @@ -200,7 +203,7 @@ XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_128x256) { Array2D a_val = GenerateRandomMatrix(128, 256); XlaOp a; auto a_data = CreateR2Parameter(a_val, 0, "a", &builder, &a); - auto result = SVD(a, 100, 1e-6); + auto result = SVD(a, 100, 1e-4); GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder); ComputeAndCompareR0(&builder, 1e-3, {a_data.get()}, @@ -212,38 +215,44 @@ XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_256x128) { Array2D a_val = GenerateRandomMatrix(256, 128); XlaOp a; auto a_data = CreateR2Parameter(a_val, 0, "a", &builder, &a); - auto result = SVD(a, 100, 1e-6); + auto result = SVD(a, 100, 1e-4); GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder); ComputeAndCompareR0(&builder, 1e-3, {a_data.get()}, ErrorSpec(1e-3, 1e-3)); } -XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_128x512) { +// Too slow on the interpreter backend. +XLA_TEST_F(SVDTest, + DISABLED_ON_INTERPRETER(Various_Size_Random_Matrix_128x512)) { XlaBuilder builder(TestName()); Array2D a_val = GenerateRandomMatrix(128, 512); XlaOp a; auto a_data = CreateR2Parameter(a_val, 0, "a", &builder, &a); - auto result = SVD(a, 100, 1e-6); + auto result = SVD(a, 100, 1e-4); GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder); ComputeAndCompareR0(&builder, 1e-3, {a_data.get()}, ErrorSpec(1e-3, 1e-3)); } -XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_512x256) { +// Too slow on the interpreter and CPU backends. +XLA_TEST_F(SVDTest, DISABLED_ON_CPU(DISABLED_ON_INTERPRETER( + Various_Size_Random_Matrix_512x256))) { XlaBuilder builder(TestName()); Array2D a_val = GenerateRandomMatrix(512, 256); XlaOp a; auto a_data = CreateR2Parameter(a_val, 0, "a", &builder, &a); - auto result = SVD(a, 100, 1e-6); + auto result = SVD(a, 100, 1e-4); GetAverageAbsoluteError(ComputeMatmulUDVT(result, &builder), a, &builder); ComputeAndCompareR0(&builder, 1e-3, {a_data.get()}, ErrorSpec(1e-3, 1e-3)); } -XLA_TEST_F(SVDTest, Various_Size_Random_Matrix_512x512) { +// Too slow on the CPU, GPU and interpreter backends. +XLA_TEST_F(SVDTest, DISABLED_ON_GPU(DISABLED_ON_CPU(DISABLED_ON_INTERPRETER( + Various_Size_Random_Matrix_512x512)))) { XlaBuilder builder(TestName()); Array2D a_val = GenerateRandomMatrix(512, 512); XlaOp a; diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 508f16a945f..b5fa1b6ced8 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -150,7 +150,7 @@ class XlaBuilder { // result, OpMetadata is set on the Computation Builder. All subsequent // instructions generated via this Computation Builder will have the same // OpMetadata attached until a call to ClearOpMetadata. - void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; } + void SetOpMetadata(OpMetadata metadata) { metadata_ = std::move(metadata); } // Clears the HloMetadata state. 
void ClearOpMetadata() { metadata_.Clear(); } diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc index 39c90b60a09..1cfb449ebd0 100644 --- a/tensorflow/compiler/xla/executable_run_options.cc +++ b/tensorflow/compiler/xla/executable_run_options.cc @@ -15,8 +15,21 @@ limitations under the License. #include "tensorflow/compiler/xla/executable_run_options.h" +#include + +#include "absl/strings/str_cat.h" + namespace xla { +RunId::RunId() { + static std::atomic counter{0}; + data_ = counter.fetch_add(1); +} + +bool operator==(const RunId& a, const RunId& b) { return a.data_ == b.data_; } + +std::string RunId::ToString() const { return absl::StrCat("RunId: ", data_); } + ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal( int device_ordinal) { device_ordinal_ = device_ordinal; @@ -94,4 +107,11 @@ ExecutableRunOptions& ExecutableRunOptions::set_rng_seed(int rng_seed) { int ExecutableRunOptions::rng_seed() const { return rng_seed_; } +ExecutableRunOptions& ExecutableRunOptions::set_run_id(RunId id) { + run_id_ = id; + return *this; +} + +RunId ExecutableRunOptions::run_id() const { return run_id_; } + } // namespace xla diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index 84629593953..4de8148451b 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -16,6 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_ #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_ +#include + +#include "tensorflow/compiler/xla/types.h" + // These classes are forward declared so that ExecutableRunOptions can be linked // into an XLA-compiled binary without having to link all of the pointed-to // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't @@ -35,6 +39,31 @@ namespace xla { class DeviceAssignment; class ExecutionProfile; +// A unique identifier for a particular "logical execution" of an XLA model. +// +// A logical execution might encompass multiple executions of one or more +// HloModules. Runs that are part of the same logical execution can +// communicate via collective ops (e.g. kAllToAll), whereas runs that are part +// of different logical executions are isolated. +class RunId { + public: + // Creates a new, unique RunId. + RunId(); + + RunId(const RunId&) = default; + RunId& operator=(const RunId&) = default; + friend bool operator==(const RunId& a, const RunId& b); + std::string ToString() const; + + template + friend H AbslHashValue(H h, const RunId& id) { + return H::combine(std::move(h), id.data_); + } + + private: + int64 data_; +}; + // Class containing options for running a LocalExecutable. 
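RunId, introduced above, only needs to be unique within a process, so it is a snapshot of a monotonically increasing atomic counter; copies compare equal, while independently constructed RunIds never do. A standalone version of the same idea (member and class names simply mirror the patch for readability):

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <string>

// A process-unique identifier for one logical execution. Copies of a RunId
// compare equal; two separately constructed RunIds never do.
class RunId {
 public:
  RunId() {
    static std::atomic<std::int64_t> counter{0};
    data_ = counter.fetch_add(1);
  }
  friend bool operator==(const RunId& a, const RunId& b) {
    return a.data_ == b.data_;
  }
  std::string ToString() const { return "RunId: " + std::to_string(data_); }

 private:
  std::int64_t data_;
};

int main() {
  RunId a, b;
  RunId a_copy = a;
  std::printf("%s %s\n", a.ToString().c_str(), b.ToString().c_str());
  std::printf("%d %d\n", a == a_copy, a == b);  // 1 0
  return 0;
}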
class ExecutableRunOptions { public: @@ -87,6 +116,9 @@ class ExecutableRunOptions { ExecutableRunOptions& set_rng_seed(int rng_seed); int rng_seed() const; + ExecutableRunOptions& set_run_id(RunId id); + RunId run_id() const; + private: stream_executor::DeviceMemoryAllocator* allocator_ = nullptr; int device_ordinal_ = -1; @@ -96,6 +128,7 @@ class ExecutableRunOptions { ExecutionProfile* execution_profile_ = nullptr; int rng_seed_ = 0; stream_executor::Stream* host_to_device_stream_ = nullptr; + RunId run_id_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/BUILD b/tensorflow/compiler/xla/experimental/xla_sharding/BUILD index a26b20c8618..57eeb25bb49 100644 --- a/tensorflow/compiler/xla/experimental/xla_sharding/BUILD +++ b/tensorflow/compiler/xla/experimental/xla_sharding/BUILD @@ -1,9 +1,10 @@ # Description: # Python API for shardings in XLA. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) py_library( name = "xla_sharding", diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc index eebd8245abe..463a8d95fc5 100644 --- a/tensorflow/compiler/xla/index_util.cc +++ b/tensorflow/compiler/xla/index_util.cc @@ -119,6 +119,8 @@ namespace xla { int64 limit = shape.dimensions(dimno); if (indices[dimno] + 1 < limit) { indices[dimno]++; + // Whenever an index of a dimension is increased, it means that all + // following dimensions have maxed out, so they must go to 0. std::fill(indices.begin() + dimno + 1, indices.end(), 0); return true; } diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h index c810ae9cbae..3c53592d040 100644 --- a/tensorflow/compiler/xla/literal.h +++ b/tensorflow/compiler/xla/literal.h @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc index 0431bb3d54a..dc11f7caa2c 100644 --- a/tensorflow/compiler/xla/literal_comparison.cc +++ b/tensorflow/compiler/xla/literal_comparison.cc @@ -662,8 +662,11 @@ Status EqualHelper(const LiteralSlice& expected, const LiteralSlice& actual) { case PRED: result = Equal(expected, actual, index, 0); break; - case U8: - result = Equal(expected, actual, index, 0); + case S8: + result = Equal(expected, actual, index, 0); + break; + case S16: + result = Equal(expected, actual, index, 0); break; case S32: result = Equal(expected, actual, index, 0); @@ -671,6 +674,12 @@ Status EqualHelper(const LiteralSlice& expected, const LiteralSlice& actual) { case S64: result = Equal(expected, actual, index, 0); break; + case U8: + result = Equal(expected, actual, index, 0); + break; + case U16: + result = Equal(expected, actual, index, 0); + break; case U32: result = Equal(expected, actual, index, 0); break; diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 45a3a264fd6..49f41d232a2 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow/core:platform/default/build_config.bzl", "pyx_library") load("//tensorflow/compiler/xla:xla.bzl", "xla_python_default_plugins") @@ -145,7 +146,6 @@ cc_library( ":shared_device_buffer", ":types", ":worker_thread", - "//tensorflow/compiler/jit:xla_launch_util", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -166,6 +166,7 @@ cc_library( "//tensorflow/core:gpu_mem_allocator", "//tensorflow/core:lib", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/stream_executor:tf_allocator_adapter", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc index facc61d515d..e13637c2fd9 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/python/local_client.cc @@ -71,7 +71,6 @@ limitations under the License. #include "absl/synchronization/notification.h" #include "absl/time/time.h" #include "include/pybind11/pybind11.h" -#include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/executable_run_options.h" @@ -88,6 +87,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/gpu/gpu_mem_allocator.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/stream_executor/tf_allocator_adapter.h" namespace xla { @@ -162,10 +162,25 @@ Device::Device(se::StreamExecutor* executor, bool use_multiple_streams, } Device::~Device() { + Status status = SynchronizeAllActivity(); + if (!status.ok()) { + LOG(ERROR) << "Error when closing device: " << status; + } +} + +Status Device::SynchronizeAllActivity() { + Status status; + // TODO(phawkins): in theory the call to SynchronizeAllActivity below should + // suffice. 
However on the Host platform SynchronizeAllActivity is a dummy + // implementation that doesn't actually block. To make sure activity has + // stopped, also block on the compute stream. If SynchronizeAllActivity is + // fixed, we could remove the BlockHostUntilDone call. + status.Update(compute_stream_->BlockHostUntilDone()); bool ok = compute_stream_->parent()->SynchronizeAllActivity(); if (!ok) { - LOG(ERROR) << "SynchronizeAllActivity failed when destroying Device."; + status.Update(Unknown("SynchronizeAllActivity failed.")); } + return status; } void Device::ThenExecuteOnWorkerThread(se::Stream* stream, @@ -174,18 +189,17 @@ void Device::ThenExecuteOnWorkerThread(se::Stream* stream, [this, callback]() { worker_thread_->Schedule(std::move(callback)); }); } -static StatusOr> -CreateBFCAllocator(se::Platform* platform, LocalClient* client, - double memory_fraction) { +static StatusOr> CreateBFCAllocator( + se::Platform* platform, LocalClient* client, double memory_fraction) { CHECK_GT(client->backend().device_count(), 0); std::vector> allocators; for (se::StreamExecutor* executor : client->backend().stream_executors()) { int device_ordinal = executor->device_ordinal(); - tensorflow::GPUMemAllocator* sub_allocator = - new tensorflow::GPUMemAllocator( - executor, tensorflow::PlatformGpuId(device_ordinal), - /*use_unified_memory=*/false, /*alloc_visitors=*/{}, - /*free_visitors=*/{}); + auto sub_allocator = absl::make_unique( + executor, tensorflow::PlatformGpuId(device_ordinal), + /*use_unified_memory=*/false, + /*alloc_visitors=*/std::vector(), + /*free_visitors=*/std::vector()); int64 free_memory; int64 total_memory; @@ -198,13 +212,13 @@ CreateBFCAllocator(se::Platform* platform, LocalClient* client, << total_memory << " bytes on device " << device_ordinal << " for BFCAllocator."; - tensorflow::BFCAllocator* gpu_bfc_allocator = new tensorflow::BFCAllocator( - sub_allocator, allocator_memory, /*allow_growth=*/false, + auto gpu_bfc_allocator = absl::make_unique( + sub_allocator.release(), allocator_memory, /*allow_growth=*/false, absl::StrCat("GPU_", device_ordinal, "_bfc")); - allocators.emplace_back(gpu_bfc_allocator); + allocators.emplace_back(std::move(gpu_bfc_allocator)); } - return absl::make_unique( - platform, std::move(allocators)); + return absl::make_unique(platform, + std::move(allocators)); } StatusOr> PyLocalClient::Get( @@ -250,8 +264,7 @@ PyLocalClient::PyLocalClient( allocator_ = client_->backend().memory_allocator(); } devices_.reserve(client->device_count()); - // TODO(phawkins): enable multistream mode on GPU too. 
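The Device destructor change above follows a common pattern: drain outstanding work through a Status-returning helper, and because a destructor cannot usefully propagate the error, log it instead. A generic sketch of that pattern; the Status stand-in and the names below are purely illustrative:

#include <cstdio>
#include <string>

// Minimal stand-in for a Status type: ok() is true when the message is empty.
class Status {
 public:
  Status() {}
  explicit Status(std::string message) : message_(std::move(message)) {}
  bool ok() const { return message_.empty(); }
  const std::string& message() const { return message_; }

 private:
  std::string message_;
};

class Device {
 public:
  ~Device() {
    // Destructors cannot return errors, so surface failures in the log only.
    Status status = SynchronizeAllActivity();
    if (!status.ok()) {
      std::fprintf(stderr, "Error when closing device: %s\n",
                   status.message().c_str());
    }
  }

 private:
  Status SynchronizeAllActivity() {
    // Block until queued work is done; report an error if that fails.
    // (A real implementation would wait on the device's streams here.)
    return Status();
  }
};

int main() {
  { Device d; }  // the destructor runs here and would log any failure
  return 0;
}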
- bool use_multiple_streams = (platform_name == "tpu"); + bool use_multiple_streams = (platform_name_ != "cpu"); bool synchronous_deallocation = !use_multiple_streams; for (int i = 0; i < client->device_count(); ++i) { se::StreamExecutor* executor = @@ -281,7 +294,7 @@ StatusOr PyLocalClient::TransferFromOutfeed( return LiteralToPython(absl::make_unique(std::move(literal))); } -static StatusOr TransferHostToDeviceAsync( +static StatusOr> TransferHostToDeviceAsync( const PythonBufferTree& tree, int device_ordinal, std::shared_ptr client, const Device& device) { se::DeviceMemoryAllocator* allocator = client->allocator(); @@ -315,8 +328,9 @@ static StatusOr TransferHostToDeviceAsync( } std::shared_ptr definition_event; if (device.use_multiple_streams()) { - definition_event = std::make_shared( - device.host_to_device_stream()->parent()); + TF_ASSIGN_OR_RETURN(definition_event, + BufferDefinitionEvent::Create( + device.host_to_device_stream()->parent())); definition_event->RecordOnStream(device.host_to_device_stream()); } std::shared_ptr device_buffer = @@ -326,11 +340,12 @@ static StatusOr TransferHostToDeviceAsync( device.ThenReleaseOnWorkerThread(device.host_to_device_stream(), device_buffer); } - return PyLocalBuffer(shape, std::move(device_buffer), std::move(client)); + return absl::make_unique(shape, std::move(device_buffer), + std::move(client)); } /* static */ -StatusOr PyLocalBuffer::FromPython( +StatusOr> PyLocalBuffer::FromPython( const py::object& argument, std::shared_ptr client, int device_ordinal) { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromPython"); @@ -349,7 +364,7 @@ StatusOr PyLocalBuffer::FromPython( << " device ordinal: " << device_ordinal; const Device& device = client->device(device_ordinal); - TF_ASSIGN_OR_RETURN(PyLocalBuffer buffer, + TF_ASSIGN_OR_RETURN(std::unique_ptr buffer, TransferHostToDeviceAsync(tree, device_ordinal, std::move(client), device)); @@ -357,20 +372,20 @@ StatusOr PyLocalBuffer::FromPython( return buffer; } -/*static */ StatusOr> +/*static */ StatusOr>> PyLocalBuffer::FromPythonValues( const std::vector>& arguments, std::shared_ptr client) { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromPythonValues"); int num_arguments = static_cast(arguments.size()); - std::vector outputs(num_arguments); + std::vector> outputs(num_arguments); if (num_arguments == 0) { return outputs; } struct H2DTransfer { PythonBufferTree tree; - StatusOr buffer; + StatusOr> buffer; PythonRefManager::ManagedPyObjects py_buffer_refs; }; @@ -385,7 +400,7 @@ PyLocalBuffer::FromPythonValues( // We are done manipulating Python objects; release the GIL. 
py::gil_scoped_release gil_release; - auto transfer_h2d = [&](int i) -> StatusOr { + auto transfer_h2d = [&](int i) -> StatusOr> { int device_ordinal = arguments[i].second; return TransferHostToDeviceAsync(transfers[i].tree, device_ordinal, client, client->device(device_ordinal)); @@ -420,18 +435,24 @@ PyLocalBuffer::FromPythonValues( return outputs; } -/* static */ StatusOr PyLocalBuffer::MakeTuple( - const std::vector buffers, +/* static */ StatusOr> PyLocalBuffer::MakeTuple( + const std::vector buffers, std::shared_ptr client, int device_ordinal) { std::vector host_shapes; std::vector> device_buffers; host_shapes.reserve(buffers.size()); device_buffers.reserve(buffers.size()); - for (const PyLocalBuffer& buffer : buffers) { - TF_RET_CHECK(buffer.device_buffer()->device_memory().device_ordinal() == - device_ordinal); - host_shapes.push_back(buffer.on_host_shape()); - device_buffers.push_back(buffer.device_buffer()); + for (const PyLocalBuffer* buffer : buffers) { + TF_RET_CHECK(buffer->device_ordinal() == device_ordinal); + std::shared_ptr device_buffer = + buffer->DeviceBuffer(); + if (!device_buffer) { + return InvalidArgument( + "Invalid buffer passed to MakeTuple() as argument %d.", + device_buffers.size()); + } + host_shapes.push_back(buffer->on_host_shape()); + device_buffers.push_back(std::move(device_buffer)); } se::DeviceMemoryAllocator* allocator = client->allocator(); TransferManager* transfer_manager = @@ -439,19 +460,20 @@ PyLocalBuffer::FromPythonValues( const Device& device = client->device(device_ordinal); std::shared_ptr definition_event; if (device.use_multiple_streams()) { - definition_event = std::make_shared( - device.host_to_device_stream()->parent()); + TF_ASSIGN_OR_RETURN(definition_event, + BufferDefinitionEvent::Create( + device.host_to_device_stream()->parent())); } TF_ASSIGN_OR_RETURN(std::shared_ptr tuple_buffer, PySharedDeviceBuffer::MakeTuple( device_buffers, transfer_manager, allocator, device_ordinal, definition_event)); - PyLocalBuffer buffer(ShapeUtil::MakeTupleShape(host_shapes), tuple_buffer, - std::move(client)); + auto buffer = absl::make_unique( + ShapeUtil::MakeTupleShape(host_shapes), tuple_buffer, std::move(client)); // TODO(phawkins): extend TransferManager so we do not need to form a full // ShapedBuffer just to write the root tuple index table. 
- ShapedBuffer shaped_buffer = buffer.AsShapedBuffer(); + TF_ASSIGN_OR_RETURN(ShapedBuffer shaped_buffer, buffer->AsShapedBuffer()); if (device.use_multiple_streams() && !transfer_manager->CanShapedBufferBeAccessedNow( device.host_to_device_stream()->parent(), shaped_buffer)) { @@ -476,21 +498,33 @@ PyLocalBuffer::PyLocalBuffer( std::shared_ptr client) : client_(std::move(client)), on_host_shape_(std::move(on_host_shape)), + device_ordinal_(device_buffer->device_ordinal()), device_buffer_(std::move(device_buffer)) {} +void PyLocalBuffer::Delete() { + absl::MutexLock lock(&mu_); + device_buffer_ = nullptr; +} + StatusOr PyLocalBuffer::ToPython() const { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::ToPython"); - auto literal = absl::make_unique(on_host_shape()); + std::shared_ptr device_buffer = DeviceBuffer(); + if (!device_buffer) { + return InvalidArgument("ToPython() called on invalid buffer."); + } + + auto literal = absl::make_unique(on_host_shape_); client_->py_ref_manager().CollectGarbage(); { py::gil_scoped_release gil_release; - se::Stream* stream = client_->device(device_buffer_->device_ordinal()) + se::Stream* stream = client_->device(device_buffer->device_ordinal()) .device_to_host_stream(); - WaitForBufferDefinitionEventsOnStream(*device_buffer_, stream); + WaitForBufferDefinitionEventsOnStream(*device_buffer, stream); absl::Notification done; Status status; + TF_ASSIGN_OR_RETURN(ShapedBuffer shaped_buffer, AsShapedBuffer()); client_->client()->backend().transfer_manager()->TransferLiteralFromDevice( - stream, AsShapedBuffer(), *literal, [&](Status done_status) { + stream, shaped_buffer, *literal, [&](Status done_status) { status = done_status; done.Notify(); }); @@ -499,28 +533,64 @@ StatusOr PyLocalBuffer::ToPython() const { return LiteralToPython(std::move(literal)); } -ShapedBuffer PyLocalBuffer::AsShapedBuffer() const { +std::shared_ptr PyLocalBuffer::DeviceBuffer() const { + absl::MutexLock lock(&mu_); + return device_buffer_; +} + +StatusOr PyLocalBuffer::AsShapedBuffer() const { + absl::MutexLock lock(&mu_); + if (!device_buffer_) { + return InvalidArgument( + "Attempted to fetch value of invalid/deleted buffer."); + } return device_buffer_->AsShapedBuffer(on_host_shape_); } -StatusOr> PyLocalBuffer::DestructureTuple() { +StatusOr>> +PyLocalBuffer::DestructureTuple() { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::DestructureTuple"); - if (!on_host_shape().IsTuple()) { + absl::MutexLock lock(&mu_); + if (!on_host_shape_.IsTuple()) { return InvalidArgument( "Attemped to destructure a PyLocalBuffer that did not have a tuple " "shape; shape: %s", - ShapeUtil::HumanString(on_host_shape())); + ShapeUtil::HumanString(on_host_shape_)); } - int num_children = ShapeUtil::TupleElementCount(on_host_shape()); - std::vector results; + if (!device_buffer_) { + return InvalidArgument("Attempted to destructure a deleted buffer."); + } + int num_children = ShapeUtil::TupleElementCount(on_host_shape_); + std::vector> results; results.reserve(num_children); for (int64 i = 0; i < num_children; ++i) { - results.push_back(PyLocalBuffer(on_host_shape().tuple_shapes(i), - device_buffer_->children().at(i), client_)); + results.push_back(absl::make_unique( + on_host_shape_.tuple_shapes(i), device_buffer_->children().at(i), + client_)); } return results; } +Status PyLocalBuffer::BlockHostUntilReady() { + tensorflow::profiler::TraceMe traceme("PyLocalBuffer::BlockHostUntilReady"); + std::shared_ptr device_buffer = DeviceBuffer(); + if (!device_buffer) { + return 
InvalidArgument("BlockHostUntilReady() called on invalid buffer."); + } + + client_->py_ref_manager().CollectGarbage(); + py::gil_scoped_release gil_release; + + // This code waits at least until the buffer is ready, but it may wait longer + // if there are other device to host transfers scheduled. If this proves to + // be an issue, we could either use a separate stream for this purpose, or + // poll for the buffer definition events. + se::Stream* stream = + client_->device(device_buffer->device_ordinal()).device_to_host_stream(); + WaitForBufferDefinitionEventsOnStream(*device_buffer, stream); + return stream->BlockHostUntilDone(); +} + PyLocalExecutable::PyLocalExecutable( std::shared_ptr executable, DeviceAssignment device_assignment, std::shared_ptr client) @@ -538,7 +608,7 @@ std::vector PyLocalExecutable::DeviceOrdinals() const { return device_ordinals; } -StatusOr PyLocalExecutable::ExecuteHelper( +StatusOr> PyLocalExecutable::ExecuteHelper( absl::Span argument_handles, int replica) { const int device_ordinal = device_assignment_(replica, 0); tensorflow::profiler::TraceMe traceme("LocalExecutable::Execute"); @@ -546,28 +616,34 @@ StatusOr PyLocalExecutable::ExecuteHelper( << " mapped to device ordinal for execution: " << device_ordinal; absl::flat_hash_set events; + std::vector> device_buffers; std::vector argument_buffers; std::vector argument_buffer_ptrs; + device_buffers.reserve(argument_handles.size() + 1); argument_buffers.reserve(argument_handles.size()); argument_buffer_ptrs.reserve(argument_handles.size()); - for (auto& handle : argument_handles) { - if (handle->device_buffer() == nullptr) { + for (int i = 0; i < argument_handles.size(); ++i) { + PyLocalBuffer* handle = argument_handles[i]; + std::shared_ptr device_buffer = + handle->DeviceBuffer(); + if (!device_buffer) { return InvalidArgument( "Deleted buffer passed to Execute() as argument " "%d to replica %d", - argument_buffers.size(), replica); + i, replica); } - if (handle->device_buffer()->device_ordinal() != device_ordinal) { + if (device_buffer->device_ordinal() != device_ordinal) { return InvalidArgument( "Buffer passed to Execute() as argument %d to replica %d is on " "device %d, but replica is assigned to device %d.", - argument_buffers.size(), replica, - handle->device_buffer()->device_ordinal(), device_ordinal); + i, replica, device_buffer->device_ordinal(), device_ordinal); } - argument_buffers.push_back(handle->AsShapedBuffer()); + TF_ASSIGN_OR_RETURN(ShapedBuffer shaped_buffer, handle->AsShapedBuffer()); + argument_buffers.push_back(std::move(shaped_buffer)); argument_buffer_ptrs.push_back(&argument_buffers.back()); - GetDeviceBufferDefinitionEvents(*handle->device_buffer(), &events); - VLOG(4) << "Argument " << argument_buffers.size() - 1 + GetDeviceBufferDefinitionEvents(*device_buffer, &events); + device_buffers.push_back(std::move(device_buffer)); + VLOG(4) << "Argument " << i << " buffer: " << argument_buffers.back().ToString(); } @@ -603,8 +679,9 @@ StatusOr PyLocalExecutable::ExecuteHelper( std::shared_ptr definition_event; if (device.use_multiple_streams()) { - definition_event = std::make_shared( - device.compute_stream()->parent()); + TF_ASSIGN_OR_RETURN( + definition_event, + BufferDefinitionEvent::Create(device.compute_stream()->parent())); definition_event->RecordOnStream(device.compute_stream()); } Shape on_host_shape = result_buffer.ValueOrDie().on_host_shape(); @@ -613,20 +690,16 @@ StatusOr PyLocalExecutable::ExecuteHelper( std::move(result_buffer.ValueOrDie()), definition_event); if 
(device.synchronous_deallocation()) { - std::vector> buffers; - buffers.reserve(argument_handles.size() + 1); - for (auto& handle : argument_handles) { - buffers.push_back(handle->device_buffer()); - } - buffers.push_back(out_buffer); + device_buffers.push_back(out_buffer); device.ThenReleaseOnWorkerThread(device.compute_stream(), - std::move(buffers)); - device.ThenReleaseOnWorkerThread(device.compute_stream(), executable_); + std::move(device_buffers)); } - return PyLocalBuffer(on_host_shape, std::move(out_buffer), client_); + device.ThenReleaseOnWorkerThread(device.compute_stream(), executable_); + return absl::make_unique(on_host_shape, std::move(out_buffer), + client_); } -StatusOr PyLocalExecutable::Execute( +StatusOr> PyLocalExecutable::Execute( absl::Span argument_handles) { if (num_replicas() != 1) { return InvalidArgument( @@ -636,7 +709,8 @@ StatusOr PyLocalExecutable::Execute( return ExecuteHelper(argument_handles, /*replica=*/0); } -StatusOr> PyLocalExecutable::ExecutePerReplica( +StatusOr>> +PyLocalExecutable::ExecutePerReplica( absl::Span> argument_handles) { tensorflow::profiler::TraceMe traceme("LocalExecutable::ExecutePerReplica"); const int num_devices = client_->device_count(); @@ -654,7 +728,7 @@ StatusOr> PyLocalExecutable::ExecutePerReplica( VLOG(1) << "Executing replicated computation; num_replicas=" << num_replicas(); - std::vector> results(num_replicas()); + std::vector>> results(num_replicas()); if (num_replicas() == 1) { // Fast-path if there is only one replica — run the computation on the // current thread. @@ -710,7 +784,7 @@ StatusOr> PyLocalExecutable::ExecutePerReplica( } VLOG(1) << "Replicated execution complete."; - std::vector wrapped_results(num_replicas()); + std::vector> wrapped_results(num_replicas()); for (int replica = 0; replica < num_replicas(); ++replica) { auto& statusor = results[replica]; if (!statusor.ok()) { @@ -728,12 +802,45 @@ StatusOr> PyLocalExecutable::ExecutePerReplica( /*static*/ StatusOr> PyLocalExecutable::Compile(const XlaComputation& computation, - std::vector argument_layouts, + absl::optional> argument_layouts, const ExecutableBuildOptions* build_options, - std::shared_ptr client) { + std::shared_ptr client, + absl::optional device_assignment) { tensorflow::profiler::TraceMe traceme("LocalExecutable::Compile"); + + ExecutableBuildOptions options; + if (build_options != nullptr) { + options = *build_options; + } + + if (device_assignment) { + if (device_assignment->replica_count() != options.num_replicas()) { + return InvalidArgument( + "Mismatched number of replicas for device " + "assignment and computation (%d vs %d).", + device_assignment->replica_count(), options.num_replicas()); + } else if (device_assignment->computation_count() != 1) { + return Unimplemented( + "Only 1 computation per replica supported, %d requested.", + device_assignment->computation_count()); + } + } else { + TF_ASSIGN_OR_RETURN( + device_assignment, + client->client()->backend().computation_placer()->AssignDevices( + options.num_replicas(), /*computation_count=*/1)); + } + + if (!argument_layouts) { + TF_ASSIGN_OR_RETURN(ProgramShape program_shape, + computation.GetProgramShape()); + argument_layouts = program_shape.parameters(); + for (Shape& shape : *argument_layouts) { + LayoutUtil::ClearLayout(&shape); + } + } std::vector argument_layout_pointers; - argument_layout_pointers.reserve(argument_layouts.size()); + argument_layout_pointers.reserve(argument_layouts->size()); // Assign a default layout to any array subshapes that are missing layouts. 
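Compile above now takes the argument layouts and the device assignment as optionals: when the caller omits them, layouts are derived from the computation's program shape (with layouts cleared) and the assignment comes from the computation placer, while an explicitly passed assignment is validated against the replica count in the build options. A small sketch of that optional-with-computed-default-plus-validation shape, covering only the device-assignment half and using invented placeholder types:

#include <cstdio>
#include <optional>
#include <string>
#include <vector>

// Placeholder for a device assignment: one entry per replica.
struct DeviceAssignment {
  int replica_count;
};

// Validates an explicit assignment against the requested replica count, or
// synthesizes a default one when none was provided.
bool ResolveDeviceAssignment(std::optional<DeviceAssignment> provided,
                             int num_replicas, DeviceAssignment* out,
                             std::string* error) {
  if (provided) {
    if (provided->replica_count != num_replicas) {
      *error = "Mismatched number of replicas for device assignment and "
               "computation (" + std::to_string(provided->replica_count) +
               " vs " + std::to_string(num_replicas) + ").";
      return false;
    }
    *out = *provided;
    return true;
  }
  // Default: assign one device per replica (stand-in for the placer).
  out->replica_count = num_replicas;
  return true;
}

int main() {
  DeviceAssignment resolved;
  std::string error;
  std::printf("%d\n",
              ResolveDeviceAssignment(std::nullopt, 2, &resolved, &error));   // 1
  std::printf("%d\n",
              ResolveDeviceAssignment(DeviceAssignment{4}, 2, &resolved, &error));  // 0
  std::printf("%s\n", error.c_str());
  return 0;
}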
auto assign_layouts = [client](Shape* shape) { @@ -751,16 +858,11 @@ PyLocalExecutable::Compile(const XlaComputation& computation, }); }; - for (Shape& layout : argument_layouts) { + for (Shape& layout : *argument_layouts) { argument_layout_pointers.push_back(&layout); TF_RETURN_IF_ERROR(assign_layouts(&layout)); } - ExecutableBuildOptions options; - if (build_options != nullptr) { - options = *build_options; - } - Shape result_layout; if (options.result_layout()) { result_layout = *options.result_layout(); @@ -776,14 +878,10 @@ PyLocalExecutable::Compile(const XlaComputation& computation, TF_ASSIGN_OR_RETURN(std::unique_ptr local_executable, client->client()->Compile( computation, argument_layout_pointers, options)); - TF_ASSIGN_OR_RETURN( - DeviceAssignment device_assignment, - client->client()->backend().computation_placer()->AssignDevices( - options.num_replicas(), /*computation_count=*/1)); return absl::make_unique( std::shared_ptr(std::move(local_executable)), - std::move(device_assignment), std::move(client)); + std::move(*device_assignment), std::move(client)); } } // namespace xla diff --git a/tensorflow/compiler/xla/python/local_client.h b/tensorflow/compiler/xla/python/local_client.h index 1ad0f933007..e70567ff6b0 100644 --- a/tensorflow/compiler/xla/python/local_client.h +++ b/tensorflow/compiler/xla/python/local_client.h @@ -169,6 +169,8 @@ class Device { } private: + Status SynchronizeAllActivity(); + bool use_multiple_streams_; bool synchronous_deallocation_; bool asynchronous_; @@ -242,49 +244,67 @@ class PyLocalClient { }; // Holds a reference from Python to one or more device buffers. +// A PyLocalBuffer can be either valid or invalid. An invalid buffer is one that +// has never been initialized, or a buffer that has been deleted (e.g., by +// calling Delete). We allow PyLocalBuffer objects to outlive the underlying +// device buffers so we can decouple buffer lifetimes from the corresponding +// Python references if needed. +// Thread-safe. class PyLocalBuffer { public: - static StatusOr FromPython( + static StatusOr> FromPython( const pybind11::object& argument, std::shared_ptr client, int device_ordinal); // Converts multiple (python object, device ordinal) pairs into // PyLocalBuffers in parallel. - static StatusOr> FromPythonValues( + static StatusOr>> FromPythonValues( const std::vector>& argument, std::shared_ptr client); - static StatusOr MakeTuple( - const std::vector buffers, + static StatusOr> MakeTuple( + const std::vector buffers, std::shared_ptr client, int device_ordinal); PyLocalBuffer() = default; PyLocalBuffer(Shape on_host_shape, std::shared_ptr device_buffer, std::shared_ptr client); + + PyLocalBuffer(const PyLocalBuffer&) = delete; + PyLocalBuffer(PyLocalBuffer&&) = delete; + PyLocalBuffer& operator=(const PyLocalBuffer&) = delete; + PyLocalBuffer& operator=(PyLocalBuffer&&) = delete; + StatusOr ToPython() const; const Shape& on_host_shape() const { return on_host_shape_; } - const std::shared_ptr& device_buffer() const { - return device_buffer_; - } - int device_ordinal() const { return device_buffer_->device_ordinal(); } + int device_ordinal() const { return device_ordinal_; } - void Delete() { - device_buffer_ = nullptr; - client_ = nullptr; - } + // Returns the associated device buffer. Returns a nullptr if the buffer is + // invalid. + std::shared_ptr DeviceBuffer() const; + + // Deletes the device memory associated with this buffer, leaving it in an + // invalid state. 
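The PyLocalBuffer comments above describe the new thread-safety scheme: the underlying device buffer is held as a shared_ptr behind a mutex, readers take a snapshot of the pointer, and Delete() simply drops the reference, so later calls can detect an invalid buffer without racing the deletion. A condensed sketch of that pattern; the Payload type and names are illustrative:

#include <cstdio>
#include <memory>
#include <mutex>
#include <string>

struct Payload {
  std::string contents;
};

class Buffer {
 public:
  explicit Buffer(std::shared_ptr<Payload> payload)
      : payload_(std::move(payload)) {}

  // Readers grab a snapshot; the snapshot keeps the payload alive even if
  // Delete() runs concurrently on another thread.
  std::shared_ptr<Payload> payload() const {
    std::lock_guard<std::mutex> lock(mu_);
    return payload_;
  }

  // Marks the buffer invalid; the payload is freed once all snapshots die.
  void Delete() {
    std::lock_guard<std::mutex> lock(mu_);
    payload_ = nullptr;
  }

  bool deleted() const { return payload() == nullptr; }

 private:
  mutable std::mutex mu_;
  std::shared_ptr<Payload> payload_;
};

int main() {
  Buffer buffer(std::make_shared<Payload>(Payload{"device memory"}));
  std::shared_ptr<Payload> snapshot = buffer.payload();
  buffer.Delete();
  // The buffer is deleted, but the earlier snapshot is still usable.
  std::printf("%d %s\n", buffer.deleted(), snapshot->contents.c_str());
  return 0;
}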
+ void Delete(); // Returns a view of the PyLocalBuffer DAG as a ShapedBuffer. The // PyLocalBuffer retains ownership of the device buffers. - ShapedBuffer AsShapedBuffer() const; + StatusOr AsShapedBuffer() const; // Destructures a tuple-valued PyLocalBuffer into its constituent elements. - StatusOr> DestructureTuple(); + StatusOr>> DestructureTuple(); + + // Blocks the host until the buffer's value has been computed and is ready for + // immediate use on the device. Useful in particular for timing benchmarks. + Status BlockHostUntilReady(); private: - std::shared_ptr client_ = nullptr; - Shape on_host_shape_; - std::shared_ptr device_buffer_; + const std::shared_ptr client_; + const Shape on_host_shape_; + const int device_ordinal_; + mutable absl::Mutex mu_; + std::shared_ptr device_buffer_ GUARDED_BY(mu_); }; // Represents a compiled computation that can be executed given handles to @@ -293,9 +313,11 @@ class PyLocalExecutable { public: // Compiles a computation to an executable. static StatusOr> Compile( - const XlaComputation& computation, std::vector argument_layouts, + const XlaComputation& computation, + absl::optional> argument_layouts, const ExecutableBuildOptions* build_options, - std::shared_ptr client); + std::shared_ptr client, + absl::optional device_assignment); PyLocalExecutable(std::shared_ptr executable, DeviceAssignment device_assignment, @@ -312,19 +334,19 @@ class PyLocalExecutable { return device_assignment_; } - StatusOr Execute( + StatusOr> Execute( absl::Span argument_handles); // Execute on many replicas. Takes a sequence of argument lists (one argument // list per replica) and returns a tuple of results (one result per replica). // The number of argument lists must be equal to the replica count. - StatusOr> ExecutePerReplica( + StatusOr>> ExecutePerReplica( absl::Span> argument_handles); void Delete() { executable_ = nullptr; } private: - StatusOr ExecuteHelper( + StatusOr> ExecuteHelper( absl::Span argument_handles, int replica); std::shared_ptr const client_; diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.cc b/tensorflow/compiler/xla/python/shared_device_buffer.cc index 8d7ce0088a4..23cf99f682e 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer.cc +++ b/tensorflow/compiler/xla/python/shared_device_buffer.cc @@ -15,10 +15,20 @@ limitations under the License. #include "tensorflow/compiler/xla/python/shared_device_buffer.h" +#include + #include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { +/*static*/ StatusOr> +BufferDefinitionEvent::Create(se::StreamExecutor* executor) { + auto event = std::make_shared(executor); + TF_RET_CHECK(event->event_.Init()) + << "Buffer definition event initialization failed"; + return event; +} + BufferDefinitionEvent::BufferDefinitionEvent(se::StreamExecutor* executor) : event_(executor) {} diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.h b/tensorflow/compiler/xla/python/shared_device_buffer.h index 31cab5ade45..98f8e6a9e13 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer.h +++ b/tensorflow/compiler/xla/python/shared_device_buffer.h @@ -51,6 +51,9 @@ namespace xla { class BufferDefinitionEvent { public: // Creates a new definition event whose event has not yet been triggered. + static StatusOr> Create( + se::StreamExecutor* executor); + explicit BufferDefinitionEvent(se::StreamExecutor* executor); // Records the definition event on the tail of 'stream'. 
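For reference, a minimal sketch (not part of the patch) of how the reworked PyLocalBuffer API declared above might be exercised from C++. The helper name is hypothetical and error handling is abbreviated; only calls that appear in the header above (FromPython, BlockHostUntilReady, DeviceBuffer, Delete) are used, and the exact template arguments are assumed from the surrounding declarations.

#include <memory>

#include "tensorflow/compiler/xla/python/local_client.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/lib/core/errors.h"

namespace xla {

// Transfers `argument` to device 0, waits for the transfer to finish, then
// deletes the device memory and shows that the wrapper object survives in the
// "invalid" state described above.
Status DemoBufferLifecycle(std::shared_ptr<PyLocalClient> client,
                           const pybind11::object& argument) {
  TF_ASSIGN_OR_RETURN(std::unique_ptr<PyLocalBuffer> buffer,
                      PyLocalBuffer::FromPython(argument, client,
                                                /*device_ordinal=*/0));

  // Useful for benchmarks now that the GPU client defaults to asynchronous
  // dispatch: block until the value is actually resident on the device.
  TF_RETURN_IF_ERROR(buffer->BlockHostUntilReady());

  TF_RET_CHECK(buffer->DeviceBuffer() != nullptr);  // Still valid.
  buffer->Delete();
  TF_RET_CHECK(buffer->DeviceBuffer() == nullptr);  // Now invalid.
  return Status::OK();
}

}  // namespace xla

After Delete() the wrapper object stays alive, but DeviceBuffer() returns nullptr, which is exactly what the is_deleted binding below relies on.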
diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index a592b0823be..298a57d32ff 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -59,7 +59,7 @@ Uniquer* GetUniquer() { return uniquer; } -static string UniquifyName(const string& name) { +static std::string UniquifyName(const std::string& name) { Uniquer* uniquer = GetUniquer(); absl::MutexLock lock(&uniquer->mu); return uniquer->name_uniquer.GetUniqueName(name); @@ -246,7 +246,7 @@ PYBIND11_MODULE(xla_extension, m) { // Device assignments py::class_(m, "DeviceAssignment") - .def_static("Create", + .def_static("create", [](py::array_t array) -> StatusOr { if (array.ndim() != 2) { return InvalidArgument( @@ -295,11 +295,12 @@ PYBIND11_MODULE(xla_extension, m) { .def_static("make_tuple", &PyLocalBuffer::MakeTuple) .def("delete", &PyLocalBuffer::Delete) .def("destructure", &PyLocalBuffer::DestructureTuple) + .def("block_host_until_ready", &PyLocalBuffer::BlockHostUntilReady) .def("to_py", &PyLocalBuffer::ToPython) .def("shape", &PyLocalBuffer::on_host_shape) .def("device", &PyLocalBuffer::device_ordinal) .def("is_deleted", [](const PyLocalBuffer& buffer) { - return buffer.device_buffer() == nullptr; + return buffer.DeviceBuffer() == nullptr; }); py::class_(m, "LocalExecutable") @@ -441,10 +442,8 @@ PYBIND11_MODULE(xla_extension, m) { ops.def("Outfeed", &Outfeed, py::arg("operand"), py::arg("shape_with_layout"), py::arg("outfeed_config") = ""); ops.def("Pad", &Pad); - ops.def( - "Parameter", - static_cast( - &Parameter)); + ops.def("Parameter", static_cast(&Parameter)); ops.def("QR", [](XlaOp a, bool full_matrices) -> StatusOr> { TF_ASSIGN_OR_RETURN(auto qr, QRDecomposition(a, full_matrices)); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index e208cacc19c..4fde9e0da74 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -108,16 +108,14 @@ class LocalBackend(Backend): def compile(self, c_computation, compile_options): options = _xla.ExecutableBuildOptions() options.num_replicas = compile_options.num_replicas - if compile_options.argument_layouts: - argument_layouts = compile_options.argument_layouts - else: - argument_layouts = c_computation.GetProgramShape().parameter_shapes() if compile_options.result_layout: options.result_layout = compile_options.result_layout options.debug_options.xla_cpu_fast_math_honor_infs = True options.debug_options.xla_cpu_fast_math_honor_nans = True - return _xla.LocalExecutable.Compile(c_computation, argument_layouts, - options, self.client) + return _xla.LocalExecutable.Compile(c_computation, + compile_options.argument_layouts, + options, self.client, + compile_options.device_assignment) def _cpu_backend_factory(): @@ -145,7 +143,7 @@ def _gpu_backend_factory(): config.memory_fraction = float(memory_fraction) client = _xla.LocalClient.Get( - platform='gpu', xla_platform_id='CUDA', asynchronous=False, + platform='gpu', xla_platform_id='CUDA', asynchronous=True, allocator_config=config) return LocalBackend(platform='gpu', client=client) @@ -362,6 +360,8 @@ class Buffer(object): # def delete(self): # def destructure(self) -> [Buffer] # def is_deleted(self) -> bool: + # def block_host_until_ready(self): + # """Blocks the calling thread until the buffer is ready on device.""" # # TODO(phawkins): remove Buffer and its static methods completely, have # clients call methods on Backend to create buffers. 
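A hedged sketch of the new compile path from the C++ side: it builds a DeviceAssignment with the same ComputationPlacer call used inside Compile and forwards it through the new optional device_assignment parameter. The wrapper function is illustrative only; ExecutableBuildOptions::set_num_replicas is assumed to be the C++ counterpart of the num_replicas option set in xla_client.py.

#include <memory>
#include <utility>

#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/client/executable_build_options.h"
#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/python/local_client.h"
#include "tensorflow/compiler/xla/service/computation_placer.h"
#include "tensorflow/compiler/xla/status_macros.h"

namespace xla {

// Compiles `computation` for `num_replicas` replicas with an explicitly
// constructed DeviceAssignment (one computation per replica, matching the
// constraint enforced in PyLocalExecutable::Compile).
StatusOr<std::unique_ptr<PyLocalExecutable>> CompileReplicatedSketch(
    const XlaComputation& computation, std::shared_ptr<PyLocalClient> client,
    int num_replicas) {
  ExecutableBuildOptions build_options;
  build_options.set_num_replicas(num_replicas);

  TF_ASSIGN_OR_RETURN(
      DeviceAssignment device_assignment,
      client->client()->backend().computation_placer()->AssignDevices(
          num_replicas, /*computation_count=*/1));

  // argument_layouts == nullopt exercises the new default path in which
  // Compile derives layout-cleared argument shapes from the program shape.
  return PyLocalExecutable::Compile(computation,
                                    /*argument_layouts=*/absl::nullopt,
                                    &build_options, std::move(client),
                                    std::move(device_assignment));
}

}  // namespace xla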
@@ -419,6 +419,27 @@ def transfer_from_outfeed(shape, device_ordinal=0): shape.with_major_to_minor_layout_if_absent(), device_ordinal) +DeviceAssignment = _xla.DeviceAssignment +DeviceAssignment.__doc__ = """ +A DeviceAssignment is a C++ object with the following signature. + +def create(assignment): + '''Builds a device assignment. + + Args: + assignment: a 2D numpy array of device ordinal integers, indexed by + [replica][computation_in_replica]. + Returns: + A device assignment. + ''' + +def replica_count(): + '''Returns the number of replicas.''' +def computation_count(): + '''Returns the number of computations per replica.''' +""" + + class CompileOptions(object): """Python object for XLA compile options. @@ -436,6 +457,7 @@ class CompileOptions(object): self.num_replicas = 1 self.argument_layouts = None self.result_layout = None + self.device_assignment = None class Computation(object): diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 682a6c099a6..f553601a561 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -520,6 +520,13 @@ class BufferTest(ComputationTest): self.assertEqual(xla_shape.dimensions(), (1, 2)) self.assertEqual(np.dtype(xla_shape.element_type()), np.dtype(np.float32)) + def testBlockHostUntilReadyWorks(self): + arg = np.array([[1., 2.]], np.float32) + arg_buffer = xla_client.Buffer.from_pyval(arg) + arg_buffer.block_host_until_ready() + # This test merely checks that nothing goes awry when we call + # block_host_until_ready(); it's difficult to test anything else. + class SingleOpTest(ComputationTest): """Tests for single ops. diff --git a/tensorflow/compiler/xla/python/xrt.cc b/tensorflow/compiler/xla/python/xrt.cc index 5292de8a079..147aafc356a 100644 --- a/tensorflow/compiler/xla/python/xrt.cc +++ b/tensorflow/compiler/xla/python/xrt.cc @@ -148,8 +148,14 @@ void AddXrtSubmodule(py::module* module) { }) .def("delete", &XrtBuffer::Delete) .def("destructure", &XrtBuffer::DestructureTuple) + .def("device", &XrtBuffer::xrt_device_ordinal) + .def("shape", &XrtBuffer::shape) .def("is_deleted", - [](const XrtBuffer& buffer) { return !buffer.handle().valid(); }); + [](const XrtBuffer& buffer) { return !buffer.handle().valid(); }) + .def("block_host_until_ready", [](const XrtBuffer& buffer) { + return errors::Unimplemented( + "block_host_until_ready not implemented in XRT backend."); + }); py::class_>(m, "XrtExecutable") .def_static("Compile", diff --git a/tensorflow/compiler/xla/python/xrt.py b/tensorflow/compiler/xla/python/xrt.py index 76a99f20481..40dea45e442 100644 --- a/tensorflow/compiler/xla/python/xrt.py +++ b/tensorflow/compiler/xla/python/xrt.py @@ -65,7 +65,7 @@ class XrtBackend(xla_client.Backend): return _xla.xrt.XrtBuffer.from_literal(self.context, device, pyval) def make_tuple(self, buffers, device_ordinal): - return _xla.xrt.XrtBuffer.make_tuple(self.context, buffers) + return _xla.xrt.XrtBuffer.make_tuple(self.context, buffers, device_ordinal) def compile(self, computation, compile_options): # pylint: disable=protected-access diff --git a/tensorflow/compiler/xla/python_api/BUILD b/tensorflow/compiler/xla/python_api/BUILD index d790c4db6c4..348a80abe2c 100644 --- a/tensorflow/compiler/xla/python_api/BUILD +++ b/tensorflow/compiler/xla/python_api/BUILD @@ -1,9 +1,10 @@ # Description: # Python API for XLA. 
-licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) py_library( name = "types", diff --git a/tensorflow/compiler/xla/refcounting_hash_map.h b/tensorflow/compiler/xla/refcounting_hash_map.h new file mode 100644 index 00000000000..19b27d6fc3a --- /dev/null +++ b/tensorflow/compiler/xla/refcounting_hash_map.h @@ -0,0 +1,115 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_REFCOUNTING_HASH_MAP_H_ +#define TENSORFLOW_COMPILER_XLA_REFCOUNTING_HASH_MAP_H_ + +#include +#include + +#include "absl/container/node_hash_map.h" +#include "absl/memory/memory.h" +#include "absl/synchronization/mutex.h" + +namespace xla { + +// RefcountingHashMap is an "eager, thread-safe cache". +// +// Given a key k you can retrieve a shared_ptr to a value v. If k is not +// already in the map, we construct a new V; if it is already in the map, we'll +// return the existing v. Once all shared_ptrs are destroyed, the entry is +// removed from the map. +// +// This class is thread-safe. +// +// Word to the wise: You might want an erase() function here that removes a +// value from the map but leaves existing shared_ptrs intact. My experience is, +// this is extremely complicated to implement correctly. +template +class RefcountingHashMap { + public: + // Default-constructs new values. + RefcountingHashMap() + : value_factory_([](const K&) { return absl::make_unique(); }) {} + + // Constructs new values according to the given factory function. + explicit RefcountingHashMap( + std::function(const K&)> value_factory) + : value_factory_(std::move(value_factory)) {} + + // Not copyable or movable because this contains internal pointers (namely, + // instances of Deleter contain pointers to `this` and into `map_`). + RefcountingHashMap(const RefcountingHashMap&) = delete; + RefcountingHashMap(RefcountingHashMap&&) = delete; + RefcountingHashMap& operator=(const RefcountingHashMap&) = delete; + RefcountingHashMap& operator=(RefcountingHashMap&&) = delete; + + // Gets the value for the given key. + // + // If the map doesn't contain a live value for the key, constructs one + // according to the factory passed to the map's constructor. + std::shared_ptr operator[](const K& key) { + absl::MutexLock lock(&mu_); + auto it = map_.find(key); + if (it == map_.end()) { + // Create entry in the map and then set its value, so the value can + // contain a pointer back into the map. + it = map_.emplace(key, std::weak_ptr()).first; + std::shared_ptr value(value_factory_(key).release(), + Deleter{&it->first, this}); + it->second = value; // Set the weak ptr to the shared ptr. + return value; + } + return it->second.lock(); + } + + // Runs a function over every key/value in the map. 
+ // + // Touching the map from within this function may deadlock; don't do it. + // + // Function signature must be compatible with + // void fn(const K&, std::shared_ptr) + // + template + void ForEach(Fn&& fn) { + absl::MutexLock lock(&mu_); + for (const auto& kv : map_) { + fn(kv.first, kv.second.lock()); + } + } + + private: + struct Deleter { + const K* key; // Points into parent->map_. + RefcountingHashMap* parent; + + void operator()(V* v) { + delete v; + absl::MutexLock lock(&parent->mu_); + auto it = parent->map_.find(*key); + CHECK(it != parent->map_.end()); + CHECK(it->second.expired()); + parent->map_.erase(it); + } + }; + + std::function(const K&)> value_factory_; + absl::Mutex mu_; + absl::node_hash_map> map_ GUARDED_BY(mu_); +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_REFCOUNTING_HASH_MAP_H_ diff --git a/tensorflow/compiler/xla/refcounting_hash_map_test.cc b/tensorflow/compiler/xla/refcounting_hash_map_test.cc new file mode 100644 index 00000000000..65120ba3df4 --- /dev/null +++ b/tensorflow/compiler/xla/refcounting_hash_map_test.cc @@ -0,0 +1,101 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/refcounting_hash_map.h" + +#include + +#include "tensorflow/compiler/xla/test.h" + +namespace xla { +namespace { + +struct DeleteNotifier { + DeleteNotifier() = default; + DeleteNotifier(const DeleteNotifier&) = delete; + DeleteNotifier& operator=(const DeleteNotifier&) = delete; + DeleteNotifier(DeleteNotifier&& o) noexcept : fn(std::move(o.fn)) { + o.fn = nullptr; + } + DeleteNotifier& operator=(DeleteNotifier&& o) noexcept { + fn = o.fn; + o.fn = nullptr; + return *this; + } + + ~DeleteNotifier() { + if (fn) { + fn(); + } + } + + std::function fn; +}; + +TEST(RefcountingHashMapTest, PointerIdentity) { + RefcountingHashMap m; + std::shared_ptr a = m[0]; + std::shared_ptr b = m[0]; + std::shared_ptr c = m[1]; + EXPECT_EQ(a.get(), b.get()); + EXPECT_NE(a.get(), c.get()); +} + +TEST(RefcountingHashMapTest, DefaultInitialized) { + RefcountingHashMap m; + EXPECT_EQ(*m[42], 0); +} + +TEST(RefcountingHashMapTest, DeletesEagerly) { + RefcountingHashMap m; + bool deleted = false; + auto handle = m[0]; + handle->fn = [&] { deleted = true; }; + EXPECT_FALSE(deleted); + handle = nullptr; + EXPECT_TRUE(deleted); +} + +TEST(RefcountingHashMapTest, CustomFactory) { + RefcountingHashMap m( + [](const int& x) { return absl::make_unique(x + 1); }); + EXPECT_EQ(*m[0], 1); + EXPECT_EQ(*m[100], 101); +} + +TEST(RefcountingHashMapTest, ForEachEmpty) { + RefcountingHashMap m; + int64 count = 0; + m.ForEach([&](const int&, std::shared_ptr) { ++count; }); + EXPECT_EQ(count, 0); +} + +TEST(RefcountingHashMapTest, ForEachNonempty) { + RefcountingHashMap m; + auto a = m[0]; + auto b = m[1]; + + std::vector seen_keys; + std::vector seen_values; + m.ForEach([&](const int& k, std::shared_ptr v) { + seen_keys.push_back(k); + 
seen_values.push_back(v.get()); + }); + EXPECT_THAT(seen_keys, testing::UnorderedElementsAre(0, 1)); + EXPECT_THAT(seen_values, testing::UnorderedElementsAre(a.get(), b.get())); +} + +} // anonymous namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc index 08b78ee2448..59b60e2b9c5 100644 --- a/tensorflow/compiler/xla/reference_util.cc +++ b/tensorflow/compiler/xla/reference_util.cc @@ -161,24 +161,24 @@ ReferenceUtil::ReduceWindow1DGeneric( const std::function& reduce_func, absl::Span window, absl::Span stride, absl::Span> padding) { - std::vector dim_lengths{static_cast(operand.size())}; - std::vector window_counts(window.size(), 0); - std::vector pad_low(window.size(), 0); - for (int64 i = 0; i < window.size(); ++i) { - int64 padded_width = padding[i].first + dim_lengths[i] + padding[i].second; - window_counts[i] = - window_util::StridedBound(padded_width, window[i], stride[i]); - pad_low[i] = padding[i].first; - } - auto result = absl::make_unique>(window_counts[0]); + CHECK_EQ(window.size(), 1); + CHECK_EQ(stride.size(), 1); + CHECK_EQ(padding.size(), 1); + + int64 padded_width = padding[0].first + operand.size() + padding[0].second; + int64 stride_amount = stride[0]; + int64 window_size = window[0]; + int64 result_size = + window_util::StridedBound(padded_width, window_size, stride_amount); + int64 pad_low = padding[0].first; + auto result = absl::make_unique>(result_size); // Do a full 1D reduce window. - for (int64 i0 = 0; i0 < window_counts[0]; ++i0) { - int64 i0_base = i0 * stride[0] - pad_low[0]; - + for (int64 i0 = 0; i0 < result_size; ++i0) { + int64 i0_base = i0 * stride_amount - pad_low; float val = init; - for (int64 i0_win = 0; i0_win < window[0]; ++i0_win) { - if (i0_base + i0_win >= 0 && i0_base + i0_win < dim_lengths[0]) { + for (int64 i0_win = 0; i0_win < window_size; ++i0_win) { + if (i0_base + i0_win >= 0 && i0_base + i0_win < operand.size()) { val = reduce_func(val, operand[i0_base + i0_win]); } } @@ -199,57 +199,6 @@ ReferenceUtil::ReduceWindow1DAdd(absl::Span operand, float init, xla::MakePadding(dim_lengths, window, stride, padding)); } -/* static */ std::unique_ptr> -ReferenceUtil::ReduceWindow2DGeneric( - const Array2D& operand, float init, - const std::function& reduce_func, - absl::Span window, absl::Span stride, - absl::Span> padding) { - std::vector dim_lengths{operand.height(), operand.width()}; - - std::vector window_counts(window.size(), 0); - std::vector pad_low(window.size(), 0); - for (int64 i = 0; i < window.size(); ++i) { - int64 padded_width = padding[i].first + dim_lengths[i] + padding[i].second; - window_counts[i] = - window_util::StridedBound(padded_width, window[i], stride[i]); - pad_low[i] = padding[i].first; - } - auto result = - absl::make_unique>(window_counts[0], window_counts[1]); - - // Do a full 2D reduce window. 
- for (int64 i0 = 0; i0 < window_counts[0]; ++i0) { - for (int64 i1 = 0; i1 < window_counts[1]; ++i1) { - int64 i0_base = i0 * stride[0] - pad_low[0]; - int64 i1_base = i1 * stride[1] - pad_low[1]; - - float val = init; - for (int64 i0_win = 0; i0_win < window[0]; ++i0_win) { - for (int64 i1_win = 0; i1_win < window[1]; ++i1_win) { - if (i0_base + i0_win >= 0 && i1_base + i1_win >= 0 && - i0_base + i0_win < operand.n1() && - i1_base + i1_win < operand.n2()) { - val = reduce_func(val, operand(i0_base + i0_win, i1_base + i1_win)); - } - } - } - (*result)(i0, i1) = val; - } - } - return result; -} - -/* static */ std::unique_ptr> ReferenceUtil::ReduceWindow2DAdd( - const Array2D& operand, float init, absl::Span window, - absl::Span stride, Padding padding) { - const auto add_reduce = [](float arg1, float arg2) { return arg1 + arg2; }; - std::vector dim_lengths{operand.height(), operand.width()}; - return ReduceWindow2DGeneric( - operand, init, add_reduce, window, stride, - xla::MakePadding(dim_lengths, window, stride, padding)); -} - /* static */ std::unique_ptr> ReferenceUtil::ReduceWindow3DAdd( const Array3D& operand, float init, absl::Span window, absl::Span stride, Padding padding) { diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h index 8654fbb9b5e..00920aa8e6a 100644 --- a/tensorflow/compiler/xla/reference_util.h +++ b/tensorflow/compiler/xla/reference_util.h @@ -180,9 +180,6 @@ class ReferenceUtil { absl::Span operand, float init, absl::Span window, absl::Span stride, Padding padding); - static std::unique_ptr> ReduceWindow2DAdd( - const Array2D& operand, float init, absl::Span window, - absl::Span stride, Padding padding); static std::unique_ptr> ReduceWindow3DAdd( const Array3D& operand, float init, absl::Span window, absl::Span stride, Padding padding); @@ -196,11 +193,6 @@ class ReferenceUtil { const std::function& reduce_func, absl::Span window, absl::Span stride, absl::Span> padding); - static std::unique_ptr> ReduceWindow2DGeneric( - const Array2D& operand, float init, - const std::function& reduce_func, - absl::Span window, absl::Span stride, - absl::Span> padding); static std::unique_ptr> ReduceWindow4DGeneric( const Array4D& operand, float init, const std::function& reduce_func, diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD index 26affbcceb3..a0cb479fbdc 100644 --- a/tensorflow/compiler/xla/rpc/BUILD +++ b/tensorflow/compiler/xla/rpc/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 1e7a924e350..be917d6763b 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -9,9 +9,10 @@ load( ) load("//tensorflow:tensorflow.bzl", "tf_cc_test") -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = [":friends"]) +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) package_group( name = "friends", @@ -402,6 +403,27 @@ tf_cc_test( ], ) +xla_test( + name = "dynamic_update_slice_test", + srcs = ["dynamic_update_slice_test.cc"], + backends = [ + "cpu", + "gpu", + ], + deps = [ + ":hlo_parser", + "//tensorflow/compiler/xla:execution_options_util", + 
"//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service/cpu:cpu_executable", + "//tensorflow/compiler/xla/service/cpu:parallel_task_assignment", + "//tensorflow/compiler/xla/service/cpu:target_machine_features", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + tf_cc_test( name = "dfs_hlo_visitor_with_default_test", srcs = ["dfs_hlo_visitor_with_default_test.cc"], @@ -857,6 +879,7 @@ cc_library( name = "shaped_buffer", srcs = ["shaped_buffer.cc"], hdrs = ["shaped_buffer.h"], + visibility = ["//visibility:public"], deps = [ "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:shape_util", @@ -1127,6 +1150,9 @@ cc_library( ":buffer_value_containers", ":heap_simulator", ":hlo", + ":hlo_alias_analysis", + ":hlo_buffer", + ":hlo_dataflow_analysis", ":hlo_proto", ":logical_buffer", ":tuple_points_to_analysis", diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index ea56c75b2f2..cc501161ce9 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/stream_executor/device_memory_allocator.h" @@ -237,7 +238,7 @@ Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory, Allocation& allocation = it->second; TF_RET_CHECK(allocation.ref_count >= 1); if (allocation.ref_count == 1) { - allocation.device_memory.Free(); + TF_RETURN_IF_ERROR(allocation.device_memory.Free()); allocation_map.erase(it); } else { allocation.ref_count--; diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc index dbd89911d92..24f910caa7c 100644 --- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc +++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc @@ -68,29 +68,31 @@ absl::optional ArCrsCombiner::MatchesArCrsPattern( Match(c->root_instruction(), m::Add(m::Parameter(), m::Parameter())); }; - if (!instruction->IsCrossModuleAllReduce() || - !computation_is_addition(instruction->called_computations()[0]) || - instruction->user_count() != 1) { - return absl::nullopt; - } - auto next = instruction->users()[0]; - int64 distance = 1; - while (!next->IsCrossReplicaAllReduce()) { - if (can_ar_move_past_instruction(next)) { - next = next->users()[0]; - } else { - return absl::nullopt; + // We only support combining cross-partition all-reduce where each replica + // belongs to its own group, since the later cross-replica all-reduce combines + // along the replica dimension. 
+ if (instruction->IsCrossModuleAllReduce() && + instruction->replica_groups().size() == num_replicas_ && + computation_is_addition(instruction->called_computations()[0]) && + instruction->user_count() == 1) { + auto next = instruction->users()[0]; + int64 distance = 1; + while (!next->IsCrossReplicaAllReduce()) { + if (can_ar_move_past_instruction(next)) { + next = next->users()[0]; + } else { + return absl::nullopt; + } + ++distance; + } + if (!Cast(next)->IsNoop() && + computation_is_addition(next->called_computations()[0])) { + ArCrsPair pair(instruction, next, distance); + VLOG(2) << "ArCrsPair matching pattern: " << pair.ToString(); + return pair; } - ++distance; - } - if (!Cast(next)->IsNoop() && - computation_is_addition(next->called_computations()[0])) { - ArCrsPair pair(instruction, next, distance); - VLOG(2) << "ArCrsPair matching pattern: " << pair.ToString(); - return pair; - } else { - return absl::nullopt; } + return absl::nullopt; } absl::optional ArCrsCombiner::WhileFromBodyParameter( @@ -238,7 +240,7 @@ bool ArCrsCombiner::TupleElementsComputeSameValue( /* static */ bool ArCrsCombiner::TestInstructionsComputeSameValue(HloInstruction* i1, HloInstruction* i2) { - ArCrsCombiner combiner(/*num_spatial_partitions=*/2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/1); auto module = i1->parent()->parent(); CHECK_EQ(module, i2->parent()->parent()); combiner.call_graph_ = CallGraph::Build(module); diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h index 4d17d5d8a31..250252b6390 100644 --- a/tensorflow/compiler/xla/service/ar_crs_combiner.h +++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h @@ -69,8 +69,9 @@ namespace xla { // class ArCrsCombiner : public HloModulePass { public: - ArCrsCombiner(int num_spatial_partitions) - : num_spatial_partitions_(num_spatial_partitions) {} + ArCrsCombiner(int num_spatial_partitions, int num_replicas) + : num_spatial_partitions_(num_spatial_partitions), + num_replicas_(num_replicas) {} absl::string_view name() const override { return "ar-crs-combiner"; } StatusOr Run(HloModule* module) override; @@ -160,6 +161,8 @@ class ArCrsCombiner : public HloModulePass { int num_spatial_partitions_; + int num_replicas_; + // Map from all-reduce ids to the AR/CRS pairs. 
absl::flat_hash_map> all_reduce_map_; diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc index 0ea26f63b95..0be31899d53 100644 --- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc +++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc @@ -452,7 +452,7 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) { auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); EXPECT_THAT(module->entry_computation()->root_instruction(), @@ -520,7 +520,7 @@ ENTRY %entrycomp (p: f32[2,1]) -> (f32[2], f32[2]) { auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); EXPECT_THAT(module->entry_computation()->root_instruction(), @@ -587,7 +587,7 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); EXPECT_THAT( @@ -668,7 +668,7 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); EXPECT_THAT( @@ -750,7 +750,7 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, ParseAndReturnVerifiedModule(module_str)); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_FALSE(changed); } @@ -810,7 +810,7 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); EXPECT_THAT(module->entry_computation()->root_instruction(), @@ -884,7 +884,7 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); EXPECT_THAT(module->entry_computation()->root_instruction(), @@ -957,7 +957,7 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); - ArCrsCombiner combiner(2); + ArCrsCombiner 
combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); EXPECT_THAT( @@ -1047,7 +1047,7 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); EXPECT_THAT(module->entry_computation()->root_instruction(), @@ -1139,7 +1139,7 @@ ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) { auto crs_before = module->entry_computation()->root_instruction()->operands()[0]; auto replica_groups_before = crs_before->replica_groups(); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_TRUE(changed); EXPECT_THAT( @@ -1217,7 +1217,7 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, ParseAndReturnVerifiedModule(module_str)); - ArCrsCombiner combiner(2); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); auto changed = combiner.Run(module.get()).ValueOrDie(); EXPECT_FALSE(changed); } @@ -1264,5 +1264,37 @@ ENTRY Parameters1.v4 { EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(f0, f1)); } +TEST_F(ArCrsCombinerTest, AllReduceWithReplicas) { + const char* module_str = R"( +HloModule foobar + +%sum.f32 (x: f32[], y: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) { + %p = bf16[] parameter(0) + %all-reduce.0 = f32[] all-reduce(%p), all_reduce_id=1, replica_groups={{0,1}}, + to_apply=%sum.f32, sharding={maximal device=0} + %all-reduce.1 = f32[] all-reduce(%p), all_reduce_id=1, replica_groups={{0,1}}, + to_apply=%sum.f32, sharding={maximal device=1} + %all-reduce.2 = f32[] all-reduce(%all-reduce.0), replica_groups={{0,1}}, + to_apply=%sum.f32, sharding={maximal device=0} + %all-reduce.3 = f32[] all-reduce(%all-reduce.1), replica_groups={{0,1}}, + to_apply=%sum.f32, sharding={maximal device=1} + ROOT %tuple = (f32[], f32[]) tuple(%all-reduce.2, %all-reduce.3), + sharding={{maximal device=0}, {maximal device=1}} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + ArCrsCombiner combiner(/*num_spatial_partitions=*/2, /*num_replicas=*/2); + auto changed = combiner.Run(module.get()).ValueOrDie(); + EXPECT_FALSE(changed); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index d859f647ea0..40283d12314 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -126,16 +126,11 @@ Backend::Backend(se::Platform* platform, Compiler* compiler, : platform_(platform), compiler_(compiler), transfer_manager_(transfer_manager), - computation_placer_(computation_placer) { - // The given set of stream executors set may include invalid executors. - for (se::StreamExecutor* exec : stream_executors) { - if (exec != nullptr) { - stream_executors_.push_back(exec); - } - } + computation_placer_(computation_placer), + stream_executors_(stream_executors.begin(), stream_executors.end()) { // Create a memory allocator for the valid stream executors. 
memory_allocator_ = absl::make_unique( - platform, stream_executors); + platform, stream_executors_); CHECK(!stream_executors_.empty()) << "Service found no devices for backend " << platform_->Name() << '.'; diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index aa57f28448e..5cbe6c44622 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -20,7 +20,6 @@ limitations under the License. #include #include #include -#include #include #include "absl/container/flat_hash_map.h" @@ -32,6 +31,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_value_containers.h" #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_buffer.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -50,30 +51,6 @@ using absl::StrAppend; using absl::StrAppendFormat; using ::tensorflow::strings::HumanReadableNumBytes; -template -string ColocatedBufferSetsToString(const T& container, const char* title) { - string result; - StrAppend(&result, title, "\n"); - for (const auto& it : container) { - StrAppend(&result, "\t", it->ToString(), "\n"); - } - return result; -} - -// Checks that points-to set of 'instruction' is unambiguous and distinct -// (ensured by CopyInsertion), then adds the buffer from the points-to set at -// 'index' to 'colocated_set'. -const LogicalBuffer* AddBufferToColocatedSet( - const HloInstruction* instruction, const ShapeIndex& index, - const TuplePointsToAnalysis& points_to_analysis, - std::vector* colocated_set) { - // CopyInsertion ensures root points-to set is unambiguous and distinct. - const auto& points_to = points_to_analysis.GetPointsToSet(instruction); - DCHECK(!points_to.IsAmbiguous()); - colocated_set->push_back(points_to.element(index)[0]); - return colocated_set->back(); -} - // Given the interference map of a graph (the list of interfering node indices // for each node), perform graph coloring such that interfering nodes are // assigned to different colors. Returns the assigned color of the nodes, where @@ -226,14 +203,15 @@ string BufferAllocation::Slice::ToString() const { } BufferAllocation::Slice BufferAllocation::GetSlice( - const LogicalBuffer& buffer) const { + const BufferValue& buffer) const { const OffsetSize os = FindOrDie(assigned_buffers_, &buffer); return Slice(this, os.offset, os.size); } -void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset, +void BufferAllocation::AddAssignment(const BufferValue& buffer, int64 offset, int64 size) { - VLOG(4) << "Trying to add " << buffer << " to allocation #" << index(); + VLOG(4) << "Adding the following buffer to allocation #" << index() << ": " + << buffer; CHECK(!assigned_buffers_.contains(&buffer)) << "LogicalBuffer " << buffer << " already assigned to allocation " << index_; @@ -306,15 +284,14 @@ string BufferAllocation::ToString() const { } StrAppend(&output, ":\n"); // Dump the assigned buffers ordered by id. 
- std::vector sorted_buffers; + std::vector sorted_buffers; for (const auto& buffer_offset_size : assigned_buffers_) { sorted_buffers.push_back(buffer_offset_size.first); } - absl::c_sort(sorted_buffers, - [](const LogicalBuffer* a, const LogicalBuffer* b) { - return a->id() < b->id(); - }); - for (const LogicalBuffer* buffer : sorted_buffers) { + absl::c_sort(sorted_buffers, [](const BufferValue* a, const BufferValue* b) { + return a->id() < b->id(); + }); + for (const BufferValue* buffer : sorted_buffers) { const OffsetSize& offset_size = FindOrDie(assigned_buffers_, buffer); StrAppend(&output, absl::StrFormat( " %s [%d,%d]: %s\n", buffer->ToString(), @@ -339,28 +316,37 @@ const PointsToSet& BufferAssignment::GetPointsToSet( return points_to_analysis().GetPointsToSet(instruction); } -bool BufferAssignment::HasAllocation(const LogicalBuffer& buffer) const { - TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer)); - return allocation_index_for_buffer_.contains(&buffer); +bool BufferAssignment::HasAllocation(const BufferValue& value) const { + return allocation_index_for_value_.contains(&value); +} + +bool BufferAssignment::HasAllocation(const HloBuffer& buffer) const { + return allocation_index_for_value_.contains(buffer.values()[0]); } const BufferAllocation& BufferAssignment::GetAssignedAllocation( - const LogicalBuffer& buffer) const { - CHECK(HasAllocation(buffer)); - return GetAllocation(allocation_index_for_buffer_.at(&buffer)); + const BufferValue& value) const { + CHECK(HasAllocation(value)); + return GetAllocation(allocation_index_for_value_.at(&value)); +} + +const BufferAllocation& BufferAssignment::GetAssignedAllocation( + const HloBuffer& hlo_buffer) const { + return GetAssignedAllocation(*hlo_buffer.values()[0]); } BufferAllocation* BufferAssignment::GetMutableAssignedAllocation( - const LogicalBuffer& buffer) { + const HloBuffer& buffer) { return const_cast(&GetAssignedAllocation(buffer)); } std::set BufferAssignment::GetAllSlices( const HloInstruction* instruction, const ShapeIndex& index) const { std::set result; - for (const LogicalBuffer* buffer : GetSourceBuffers(instruction, index)) { - if (HasAllocation(*buffer)) { - result.insert(GetAssignedAllocation(*buffer).GetSlice(*buffer)); + for (const BufferValue* value : + dataflow_analysis().GetValueSet(instruction, index).values()) { + if (HasAllocation(*value)) { + result.insert(GetAssignedAllocation(*value).GetSlice(*value)); } } return result; @@ -375,15 +361,15 @@ const BufferAllocation& BufferAssignment::GetAllocation( const BufferAllocation* BufferAssignment::GetInstructionAllocation( const HloInstruction* hlo, const ShapeIndex& shape_index) const { - const PointsToSet& points_to_set = points_to_analysis().GetPointsToSet(hlo); - const LogicalBuffer* buffer = points_to_set.element(shape_index)[0]; + const BufferValue* value = + dataflow_analysis().GetValueSet(hlo, shape_index).values()[0]; - if (!HasAllocation(*buffer)) { + if (!HasAllocation(*value)) { return nullptr; } const BufferAllocation& instruction_allocation = - GetAssignedAllocation(*buffer); + GetAssignedAllocation(*value); return &instruction_allocation; } @@ -394,9 +380,9 @@ BufferAllocation* BufferAssignment::GetMutableAllocation( bool BufferAssignment::HasAllocationAt(const HloInstruction* instruction, const ShapeIndex& index) const { - for (const LogicalBuffer* buffer : - GetPointsToSet(instruction).element(index)) { - if (allocation_index_for_buffer_.contains(buffer)) { + for (const BufferValue* value : + dataflow_analysis().GetValueSet(instruction, 
index).values()) { + if (allocation_index_for_value_.contains(value)) { return true; } } @@ -413,13 +399,13 @@ StatusOr BufferAssignment::GetUniqueSlice( VLOG(3) << "Trying to find unique slice for " << instruction->name() << " [" << index << "]"; BufferAllocation::Slice result; - for (const LogicalBuffer* buffer : - GetPointsToSet(instruction).element(index)) { - VLOG(3) << "Examining buffer " << *buffer; - if (HasAllocation(*buffer)) { + for (const BufferValue* value : + dataflow_analysis().GetValueSet(instruction, index).values()) { + VLOG(3) << "Examining value " << *value; + if (HasAllocation(*value)) { VLOG(3) << "Has allocation"; const BufferAllocation::Slice slice = - GetAssignedAllocation(*buffer).GetSlice(*buffer); + GetAssignedAllocation(*value).GetSlice(*value); if (result.allocation() == nullptr) { result = slice; } else if (result != slice) { @@ -500,39 +486,55 @@ BufferAllocation* BufferAssignment::NewEmptyAllocation( return allocation; } -BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer, +BufferAllocation* BufferAssignment::NewAllocation(const HloBuffer& buffer, int64 size) { BufferAllocation* allocation = NewEmptyAllocation(size, buffer.color()); AddAssignment(allocation, buffer, /*offset=*/0, size); - allocation->peak_buffers_.push_back(&buffer); + allocation->peak_buffers_.push_back(buffer.values()[0]); return allocation; } -// Adds an instruction to the set assigned to the given buffer. void BufferAssignment::AddAssignment(BufferAllocation* allocation, - const LogicalBuffer& buffer, int64 offset, + const HloBuffer& buffer, int64 offset, int64 size) { - CHECK(!allocation_index_for_buffer_.contains(&buffer)) - << "LogicalBuffer " << buffer << " already has an allocation."; CHECK(allocation->is_reusable() || allocation->assigned_buffers().empty()) << "Non-reusable allocation already assigned a buffer: " << allocation->ToString(); - TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer)); + for (const BufferValue* buffer_value : buffer.values()) { + CHECK(!allocation_index_for_value_.contains(buffer_value)) + << "BufferValue " << buffer_value << " already has an allocation."; + allocation->AddAssignment(*buffer_value, offset, size); + allocation_index_for_value_[buffer_value] = allocation->index(); + } - allocation->AddAssignment(buffer, offset, size); - if (liveness().MaybeLiveOut(buffer)) { + if (alias_analysis().BufferLivesOut(buffer)) { + VLOG(3) << "HloBuffer lives out" << buffer.ToString(); + VLOG(3) << "Set maybe live out: " << allocation->ToString(); + allocation->set_maybe_live_out(true); + } +} + +void BufferAssignment::AddAssignment(BufferAllocation* allocation, + const BufferValue& value, int64 offset, + int64 size) { + allocation->AddAssignment(value, offset, size); + allocation_index_for_value_[&value] = allocation->index(); + const HloValue& hlo_value = + *CHECK_NOTNULL(dynamic_cast(&value)); + if (alias_analysis().ValueLivesOut(hlo_value)) { + VLOG(3) << "HloValue lives out: " << hlo_value.ToString(); + VLOG(3) << "Set maybe live out: " << allocation->ToString(); allocation->set_maybe_live_out(true); } - allocation_index_for_buffer_[&buffer] = allocation->index(); } // Combines allocations of temporary buffers of the same color into one big // BufferAllocation. 
void BufferAssignment::CombineTempAllocations() { VLOG(1) << "CombineTempAllocations()"; - flat_hash_map + flat_hash_map combined_allocation_map; // Move all temp allocations into a single run at the end of the allocations @@ -548,7 +550,7 @@ void BufferAssignment::CombineTempAllocations() { if (first_temp_it != allocations_.end()) { for (auto it = first_temp_it; it != allocations_.end(); ++it) { const BufferAllocation& temp_allocation = *it; - LogicalBuffer::Color color = temp_allocation.color(); + BufferValue::Color color = temp_allocation.color(); auto combined_it = combined_allocation_map.find(color); if (combined_it == combined_allocation_map.end()) { // We have found the first temp allocation of this color. Collect @@ -571,15 +573,16 @@ void BufferAssignment::CombineTempAllocations() { RoundUpToNearest(combined_allocation->size(), alignment); combined_allocation->set_size(base + temp_allocation.size()); for (const auto& buffer_offset_size : temp_allocation.assigned_buffers_) { - const LogicalBuffer* buffer = buffer_offset_size.first; + const BufferValue* value = buffer_offset_size.first; const int64 offset = buffer_offset_size.second.offset; const int64 size = buffer_offset_size.second.size; - combined_allocation->AddAssignment(*buffer, base + offset, size); + combined_allocation->AddAssignment(*value, base + offset, size); } if (!temp_allocation.HeapTraces().empty()) { CHECK_EQ(temp_allocation.HeapTraces().size(), 1); combined_allocation->AddHeapTrace(temp_allocation.HeapTraces().front()); } + combined_allocation->peak_buffers_.insert( combined_allocation->peak_buffers_.end(), temp_allocation.peak_buffers_.begin(), @@ -595,14 +598,14 @@ void BufferAssignment::CombineTempAllocations() { } // Update allocation indices to their new positions. - allocation_index_for_buffer_.erase(allocation_index_for_buffer_.begin(), - allocation_index_for_buffer_.end()); + allocation_index_for_value_.erase(allocation_index_for_value_.begin(), + allocation_index_for_value_.end()); for (size_t index = 0; index < allocations_.size(); ++index) { BufferAllocation* allocation = &allocations_[index]; allocation->set_index(index); for (const auto& buffer_offset_size : allocation->assigned_buffers_) { - const LogicalBuffer* buffer = buffer_offset_size.first; - allocation_index_for_buffer_[buffer] = index; + const BufferValue* value = buffer_offset_size.first; + allocation_index_for_value_[value] = index; } } } @@ -694,30 +697,28 @@ string BufferAssignment::ToString() const { BufferAssignmentProto BufferAssignment::ToProto() const { BufferAssignmentProto proto; - // NOTE: TuplePointsToAnalysis state is serialized here in BufferAssigment, + // NOTE: DataflowAnalysis state is serialized here in BufferAssignment, // because we need to do the HasAllocation check for each buffer. Otherwise // the buffer_size_ call might fail for some backends. - const TuplePointsToAnalysis& points_to_analysis = - liveness_->points_to_analysis(); - for (LogicalBuffer::Id id = 0; id < points_to_analysis.num_logical_buffers(); - id++) { - auto& buffer = points_to_analysis.logical_buffer(id); - if (HasAllocation(buffer)) { - LogicalBufferProto proto_buffer = buffer.ToProto(buffer_size_); + const HloDataflowAnalysis& dataflow = this->dataflow_analysis(); + for (BufferValue::Id id = 0; id < dataflow.values().size(); id++) { + auto& value = dataflow.values().at(id); + if (HasAllocation(*value)) { + LogicalBufferProto proto_buffer = value->ToProto(buffer_size_); proto.add_logical_buffers()->Swap(&proto_buffer); // Fill buffer aliases. 
- for (const BufferAlias& alias : - points_to_analysis.GetBufferAliases(buffer)) { - if (alias.instruction() == buffer.instruction() && - alias.index() == buffer.index()) { + for (const HloValue* alias : + alias_analysis().GetBufferContainingValue(*value).values()) { + if (alias->instruction() == value->instruction() && + alias->index() == value->index()) { continue; // skip self-aliases } BufferAssignmentProto::BufferAlias* proto_alias = proto.add_buffer_aliases(); LogicalBufferProto::Location proto_alias_location = - BufferValue::ToLocationProto(*alias.instruction(), alias.index()); - proto_alias->set_source_buffer_id(buffer.id()); + BufferValue::ToLocationProto(*alias->instruction(), alias->index()); + proto_alias->set_source_buffer_id(value->id()); proto_alias->mutable_location()->Swap(&proto_alias_location); } } @@ -735,114 +736,70 @@ BufferAssignmentProto BufferAssignment::ToProto() const { /* static */ StatusOr> BufferAssigner::Run( const HloModule* module, std::unique_ptr hlo_ordering, - LogicalBuffer::SizeFunction buffer_size, + BufferValue::SizeFunction buffer_size, LogicalBuffer::AlignmentFunction color_alignment, - bool allow_input_output_aliasing, bool allocate_buffers_for_constants, - BufferLiveness::Colorer colorer, ReuseAllocationFunction reuse_checker, - ReuseColocatedAllocationForTempChecker reuse_colocated_checker) { + bool allocate_buffers_for_constants, BufferAssigner::Colorer colorer, + const absl::flat_hash_set& reuse_checker, + HloDataflowAnalysis::FusionCanShareBufferFunction fusion_can_share_buffer) { BufferAssigner assigner(allocate_buffers_for_constants, std::move(colorer), - std::move(reuse_checker), - std::move(reuse_colocated_checker)); - return assigner.CreateAssignment(module, std::move(hlo_ordering), - std::move(buffer_size), - std::move(color_alignment)); + reuse_checker); + return assigner.CreateAssignment( + module, std::move(hlo_ordering), std::move(buffer_size), + std::move(color_alignment), std::move(fusion_can_share_buffer)); } namespace { -// a and b are in different subcomputations. Check for the case -// where a is inside the while body, and b is outside, part of the same while's -// init-operand or while-result. -bool MayInterfereAcrossSubcomputations(BufferAssignment* assignment, - const LogicalBuffer& a_buffer, - const LogicalBuffer& b_buffer) { - const CallGraph& call_graph = - assignment->liveness().hlo_ordering().call_graph(); - const HloInstruction* a_ancestor; - const HloInstruction* b_ancestor; - std::tie(a_ancestor, b_ancestor) = - call_graph.NearestAncestorsInSameComputation(a_buffer.instruction(), - b_buffer.instruction()); - if (a_ancestor == nullptr) { - // No common ancestor. 
- return true; - } - if (a_ancestor->opcode() == HloOpcode::kWhile && - call_graph.InstructionIsNestedIn(a_buffer.instruction(), - a_ancestor->while_body())) { - const PointsToSet& init_set = - assignment->liveness().points_to_analysis().GetPointsToSet( - a_ancestor->operand(0)); - if (init_set.ContainsBuffer(b_buffer)) { - VLOG(4) << "Can't interfere: " << a_buffer << " and " << b_buffer - << " (part of while-operand)"; - return false; - } - const PointsToSet& while_set = - assignment->liveness().points_to_analysis().GetPointsToSet(a_ancestor); - if (while_set.ContainsBuffer(b_buffer)) { - VLOG(4) << "Can't interfere: " << a_buffer << " and " << b_buffer - << " (part of while)"; - return false; +void ConvertHeapSimulatorResultToHloValue( + HeapSimulator::Result* result, const HloDataflowAnalysis& dataflow_analysis, + const TuplePointsToAnalysis& points_to) { + absl::flat_hash_map + chunk_map_with_hlo_value; + for (auto& value_to_chunk : result->chunk_map) { + const BufferValue* value = value_to_chunk.first; + if (!dataflow_analysis.ValueIsDefinedAt(value->instruction(), + value->index())) { + continue; } + const HloValue& hlo_value = dataflow_analysis.GetValueDefinedAt( + value->instruction(), value->index()); + chunk_map_with_hlo_value[&hlo_value] = value_to_chunk.second; } - return true; -} - -// Return true, if a and b can't possibly interfere (and therefore further -// checking for interference can be skipped). This function checks for special -// cases where copy insertion guarantees no interference, but the regular buffer -// liveness is too conservative: -// -// Operations inside a while-body can't interfere with operations outside the -// while op if their last use is at the while-loop itself as part of the -// while-init op, or the while-result. For ops that are live across a -// while-loop, copy insertion will already insert the necessary copies to avoid -// such interference. -// -// This allows sharing buffers in cases like this: -// init = {...} -// while (init): -// p = param(0) -// gte = get-tuple-element(p), index=i -// t1 = op1 (gte) -// t2 = op2 (t1) -// ROOT tuple = {..., t2, ...} -// -// where t1 and t2 can share the same buffer. -bool MaySkipInterferenceCheck(BufferAssignment* assignment, - const LogicalBuffer& a_buffer, - const LogicalBuffer& b_buffer) { - if (a_buffer.instruction()->parent() == b_buffer.instruction()->parent()) { - // Ops within the same computation are not handled here. Assume that they - // may interfere. - return false; + result->chunk_map = chunk_map_with_hlo_value; + // Set up debug trace. 
+ for (int64 i = 0; i < result->debug_trace.events_size(); ++i) { + int64 buffer_id = result->debug_trace.mutable_events(i)->buffer_id(); + const LogicalBuffer& logical_buffer = points_to.GetBuffer(buffer_id); + const HloValue* hlo_value = + dataflow_analysis + .GetValueSet(logical_buffer.instruction(), logical_buffer.index()) + .values()[0]; + result->debug_trace.mutable_events(i)->set_buffer_id(hlo_value->id()); } - return !MayInterfereAcrossSubcomputations(assignment, a_buffer, b_buffer) || - !MayInterfereAcrossSubcomputations(assignment, b_buffer, a_buffer); } } // namespace bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, - const LogicalBuffer& buffer, + const HloBuffer& hlo_buffer, BufferAssignment* assignment) { - const LogicalBuffer::SizeFunction& buffer_size = assignment->buffer_size_; + CHECK(!assignment->HasAllocation(hlo_buffer)) + << "buffer " << hlo_buffer << " already has an allocation assigned."; - CHECK(!assignment->HasAllocation(buffer)) - << "buffer " << buffer << " already has an allocation assigned."; + VLOG(4) << "Trying to assign " << hlo_buffer << " size " + << assignment->HloBufferSize(hlo_buffer) + << " to allocation: " << *allocation; - VLOG(4) << "Trying to assign " << buffer << " to allocation: " << *allocation; - - if (buffer.color() != allocation->color()) { - VLOG(4) << "Can't assign: buffer has color" << buffer.color() + if (hlo_buffer.color() != allocation->color()) { + VLOG(4) << "Can't assign: buffer has color" << hlo_buffer.color() << " and allocation has color " << allocation->color() << "."; return false; } - if (buffer_size(buffer) > allocation->size()) { + if (assignment->HloBufferSize(hlo_buffer) > allocation->size()) { VLOG(4) << "Can't assign: buffer is larger than allocation (" - << buffer_size(buffer) << " > " << allocation->size() << ")"; + << assignment->HloBufferSize(hlo_buffer) << " > " + << allocation->size() << ")"; return false; } @@ -851,10 +808,33 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, return false; } - if (reuse_checker_ != nullptr && - !reuse_checker_(*assignment, *allocation, buffer)) { - VLOG(4) << "Can't assign: reuse_checker_(allocation, buffer) == false"; - return false; + if (!must_not_live_out_.empty()) { + if (allocation->maybe_live_out()) { + // If a buffer maybe live out, the allocation cannot contain any node from + // the "must_not_live_out_" set. + for (const HloValue* value : hlo_buffer.values()) { + if (must_not_live_out_.count(value->instruction()->opcode()) > 0) { + VLOG(4) << "Can't assign: " << value->instruction()->ToString() + << " cannot live out of the module"; + return false; + } + } + } + // The above check is not enough -- There could be the case where an + // allocation can be not live out and contains an instruction with opcode + // from the "must_not_live_out_" set, but assigning a live out buffer to + // that allocation makes the allocation live out and also contains + // instruction from the "must_not_live_out_" set. 
+ if (assignment->alias_analysis().BufferLivesOut(hlo_buffer)) { + for (const auto& buffer_offset_size : allocation->assigned_buffers()) { + if (must_not_live_out_.count( + buffer_offset_size.first->instruction()->opcode()) > 0) { + VLOG(4) << "Can't assign: " << buffer_offset_size.first->instruction() + << " cannot live out of the module"; + return false; + } + } + } } if (!allocation->is_reusable()) { @@ -863,299 +843,218 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, } for (const auto& buffer_offset_size : allocation->assigned_buffers()) { - const LogicalBuffer& assigned_buffer = *buffer_offset_size.first; - if (MaySkipInterferenceCheck(assignment, buffer, assigned_buffer)) { - continue; - } - if (assignment->liveness().MayInterfere(assigned_buffer, buffer)) { - VLOG(4) << "Can't assign: assignee " << assigned_buffer - << " may interfere with " << buffer; - return false; - } - // Copy instruction don't share a buffer with their input operand. - if (buffer.instruction()->IsUserOf(assigned_buffer.instruction()) && - buffer.instruction()->opcode() == HloOpcode::kCopy) { - VLOG(4) << "Can't assign: assignee " << assigned_buffer - << " is used at copy instruction " << buffer; - return false; + // Pairwise compare. + const HloValue& assigned_buffer = + *CHECK_NOTNULL(dynamic_cast(buffer_offset_size.first)); + for (const HloValue* new_value : hlo_buffer.values()) { + if (assignment->liveness().hlo_ordering().MayInterfere( + assigned_buffer, *new_value, assignment->dataflow_analysis())) { + VLOG(4) << "Can't assign: assignee " << assigned_buffer + << " may interfere with " << new_value; + return false; + } + + for (const HloPosition& assgiend_buffer_position : + assigned_buffer.positions()) { + // Copy instruction don't share a buffer with their input operand. + if (new_value->instruction()->IsUserOf( + assgiend_buffer_position.instruction) && + new_value->instruction()->opcode() == HloOpcode::kCopy) { + VLOG(4) << "Can't assign: assignee " << assigned_buffer + << " is used at copy instruction " << new_value; + return false; + } + } } } // If the buffer is live out of the computation then it should only be // assigned a buffer which exactly fits the result to avoid wasting memory // (result buffers can have arbitrary lifetimes). - if (assignment->liveness().MaybeLiveOut(buffer) && - allocation->size() != buffer_size(buffer)) { - VLOG(4) << "Can't assign: buffer " << buffer + if (assignment->alias_analysis().BufferLivesOut(hlo_buffer) && + allocation->size() != assignment->HloBufferSize(hlo_buffer)) { + VLOG(4) << "Can't assign: buffer " << hlo_buffer << "is live out and size not the same as allocation"; return false; } - assignment->AddAssignment(allocation, buffer, /*offset=*/0, - buffer_size(buffer)); + assignment->AddAssignment(allocation, hlo_buffer, /*offset=*/0, + assignment->HloBufferSize(hlo_buffer)); return true; -} +} // namespace xla -Status BufferAssigner::AssignBuffersForComputation( - const HloComputation* computation, bool is_thread_local, - const flat_hash_set& colocated_buffers, - const flat_hash_set& colocated_allocations, - flat_hash_map>* - buffers_to_assign_sequentially, - BufferAssignment* assignment) { - // Buffers are sorted and assigned to BufferAllocations in decreasing order of - // size. - std::vector sorted_buffers; - for (auto* instruction : computation->instructions()) { - // Add all buffers which this instruction defines. 
Instruction which don't - // define buffers (eg, bitcast which just forwards a pointer) don't need - // any allocations. - for (const LogicalBuffer* buffer : - assignment->points_to_analysis().GetBuffersDefinedByInstruction( - instruction)) { - sorted_buffers.push_back(buffer); - } - } - - // Generate a post order sort of instructions for sorting of the - // LogicalBuffers. - flat_hash_map post_order_position; - int position = 0; - for (auto* instruction : computation->MakeInstructionPostOrder()) { - post_order_position.emplace(instruction, position); - position++; - } - - // If there is a sequential instruction ordering, we'll delay assignment of - // temp buffers until after the main assignment loop. - const BufferLiveness& liveness = assignment->liveness(); - const bool has_sequential_order = - liveness.hlo_ordering().SequentialOrder(*computation) != nullptr; - if (has_sequential_order && buffers_to_assign_sequentially != nullptr) { - // Every sequential computation must get an entry in the - // buffers_to_assign_sequentially map, even if we end up with an empty set - // of buffers. This ensures we can correctly determine whether to run - // whole-module heap simulation. - buffers_to_assign_sequentially->emplace( - computation, flat_hash_set()); - } - - // Sort the LogicalBuffers first by size. We assign the larger LogicalBuffers - // first for simplicity. This means any previously created BufferAllocation is - // necessarily large enough to hold the output of the current Buffer in - // consideration. +Status BufferAssigner::MergeInplaceOpBuffers(BufferAssignment* assignment) { + // Try allocate same buffer for dynamic update slice's operand and output. // - // As a secondary sorting criteria, if the instructions are sequentially - // ordered, we assign live-out buffers before others. Note that for sequential - // computations, we'll take temp buffers that can't re-use any allocations and - // assign them via a heap scheduler. By assigning live-out buffers first, we - // increase the odds that temp buffers can re-use an allocation. - // - // As a final tiebreaker use post order position of the HLO instruction which - // defines the buffer. This means an instruction will appear after its - // operands (assuming operands are the same/larger size) enabling the - // important reuse case where an elementwise instruction reuses one of its - // operand's buffer. This improves locality. - absl::c_sort(sorted_buffers, - [has_sequential_order, &liveness, &post_order_position, - assignment](const LogicalBuffer* a, const LogicalBuffer* b) { - // Primary sort is by decreasing buffer size. - const int64 a_size = assignment->buffer_size_(*a); - const int64 b_size = assignment->buffer_size_(*b); - if (a_size != b_size) { - return a_size > b_size; // use ">" for decreasing size. - } - // Otherwise live out buffers come before others, if the - // instructions are sequentially ordered. - if (has_sequential_order) { - const bool a_live_out = liveness.MaybeLiveOut(*a); - const bool b_live_out = liveness.MaybeLiveOut(*b); - if (a_live_out != b_live_out) { - return a_live_out; - } - } - // Final tiebreaker is in instruction post order. - return post_order_position.at(a->instruction()) < - post_order_position.at(b->instruction()); - }); + // TODO(yunxing): Moving this logic to alias analysis and add must-alias rule + // to operations that can be done in place. 
+ for (HloComputation* computation : assignment->module().computations()) { + for (HloInstruction* instruction : computation->instructions()) { + if (!(instruction->opcode() == HloOpcode::kDynamicUpdateSlice || + (instruction->opcode() == HloOpcode::kFusion && + (instruction->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice)))) { + continue; + } + if (instruction->parent()->IsFusionComputation()) { + continue; + } + if (instruction->operand_count() == 0) { + continue; + } + // Can't share the buffer. + if (!assignment->dataflow_analysis().CanShareOperandBufferWithUser( + instruction->mutable_operand(0), {}, instruction, {})) { + continue; + } + HloBuffer& instruction_buffer = + assignment->alias_analysis().GetUniqueBufferAt(instruction, {}); - // BufferAllocations are necessarily created in decreasing size order. Keep - // indices of previously created BufferAllocations in new_allocation_indices. - std::vector new_allocation_indices; + HloBuffer& operand_buffer = + assignment->alias_analysis().GetUniqueBufferAt( + instruction->operand(0), {}); - // A sorted multimap from size to indices of colocated allocations. - std::multimap - colocated_allocation_size_to_indices; - { - std::priority_queue sorted_colocated_indices; - for (auto index : colocated_allocations) { - bool consider_reusing = true; - // Output tuple table may be allocated at run-time, so make sure we don't - // overwrite them. - for (const auto& buffer_offset_size : - assignment->GetAllocation(index).assigned_buffers()) { - if (buffer_offset_size.first->shape().IsTuple()) { - consider_reusing = false; - break; + // Already have the same buffer. No need to merge those. + if (instruction_buffer.id() == operand_buffer.id()) { + continue; + } + + bool interfere = false; + + for (const HloValue* instruction_value : instruction_buffer.values()) { + for (const HloValue* operand_value : operand_buffer.values()) { + if (assignment->liveness().hlo_ordering().MayInterfere( + *instruction_value, *operand_value, + assignment->dataflow_analysis())) { + interfere = true; + break; + } } } - if (consider_reusing) { - sorted_colocated_indices.push(index); + if (interfere) { + continue; } - } - while (!sorted_colocated_indices.empty()) { - auto index = sorted_colocated_indices.top(); - sorted_colocated_indices.pop(); - colocated_allocation_size_to_indices.emplace( - assignment->GetAllocation(index).size(), index); + if (assignment->alias_analysis().BufferLivesOut(instruction_buffer)) { + continue; + } + if (instruction_buffer.color() != operand_buffer.color()) { + continue; + } + VLOG(3) << "Merging inplace " << instruction_buffer << " and " + << operand_buffer; + assignment->alias_analysis().MergeBuffers(instruction_buffer, + operand_buffer); } } - for (const LogicalBuffer* buffer : sorted_buffers) { - VLOG(3) << "Assigning allocation to: " << *buffer; - if (colocated_buffers.contains(buffer)) { - // Colocated buffers are currently assigned in an earlier pass. 
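// A simplified standalone model (not the XLA implementation) of the merge that
// MergeInplaceOpBuffers performs above: the output and operand buffers of an
// in-place op, each a set of values, may share storage only if no value of one
// interferes with any value of the other, the colors match, and the output
// buffer does not live out. Interference is modeled here as live-range
// overlap, which is a simplification of HloOrdering::MayInterfere.
#include <cstdint>
#include <vector>

struct LiveRange {
  int64_t start = 0;
  int64_t end = 0;  // Exclusive.
};

struct SketchHloBuffer {
  int color = 0;
  bool lives_out = false;
  std::vector<LiveRange> value_ranges;
};

static bool RangesOverlap(const LiveRange& a, const LiveRange& b) {
  return a.start < b.end && b.start < a.end;
}

// Returns true if the two buffers may be merged under the rules above.
bool CanMergeInplace(const SketchHloBuffer& output,
                     const SketchHloBuffer& operand) {
  if (output.color != operand.color) return false;
  if (output.lives_out) return false;
  for (const LiveRange& a : output.value_ranges) {
    for (const LiveRange& b : operand.value_ranges) {
      if (RangesOverlap(a, b)) return false;  // Pairwise interference check.
    }
  }
  return true;
}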
- VLOG(3) << "Skipping colocated buffer: " << *buffer; - continue; - } + return Status::OK(); +} - TF_RET_CHECK(!assignment->HasAllocation(*buffer)); - - const HloInstruction* instruction = buffer->instruction(); - const int64 buffer_size = assignment->buffer_size_(*buffer); - - if (instruction->opcode() == HloOpcode::kConstant) { +Status BufferAssigner::AssignSingleHloBuffer( + const HloBuffer* hlo_buffer, bool is_thread_local, + absl::flat_hash_map>* + buffers_to_assign_sequentially, + std::vector* allocation_indices, + BufferAssignment* assignment) { + const int64 buffer_size = assignment->HloBufferSize(*hlo_buffer); + for (const HloValue* value : hlo_buffer->values()) { + if (value->instruction()->opcode() == HloOpcode::kConstant) { if (allocate_buffers_for_constants_) { BufferAllocation* allocation = - assignment->NewAllocation(*buffer, buffer_size); + assignment->NewAllocation(*hlo_buffer, buffer_size); allocation->set_constant(true); VLOG(3) << "New allocation #" << allocation->index() << " for constant " - << *buffer; + << *hlo_buffer << " value ptr: " << value; } - continue; + VLOG(3) << "Not allocating buffer for constant"; + return Status::OK(); } + const HloInstruction* instruction = value->instruction(); const bool is_entry_parameter = instruction->opcode() == HloOpcode::kParameter && - computation == computation->parent()->entry_computation(); + instruction->parent() == + instruction->parent()->parent()->entry_computation(); + if (is_entry_parameter) { - // If the LogicalBuffer is part of an external parameter, creates a new + bool parameter_has_alias = + assignment->module().input_output_alias_config().ParameterHasAlias( + instruction->parameter_number(), value->index()); + // If the hlo buffer is part of an external parameter, creates a new // allocation and sets its parameter number. Parameters of non-entry // computations do not need special allocations because they live inside // callers. 
BufferAllocation* allocation = - assignment->NewAllocation(*buffer, buffer_size); - bool parameter_has_alias = - assignment->module().input_output_alias_config().ParameterHasAlias( - instruction->parameter_number(), buffer->index()); + assignment->NewAllocation(*hlo_buffer, buffer_size); + allocation->set_entry_computation_parameter( - instruction->parameter_number(), buffer->index(), - parameter_has_alias); - VLOG(3) << "Mark allocation #" << allocation->index() - << " as entry computation parameter: " << *buffer; - continue; - } - - if (is_thread_local) { - BufferAllocation* allocation = - assignment->NewAllocation(*buffer, buffer_size); - allocation->set_is_thread_local(true); + instruction->parameter_number(), value->index(), parameter_has_alias); + if (parameter_has_alias) { + allocation_indices->push_back(allocation->index()); + } VLOG(3) << "New allocation #" << allocation->index() - << " for thread-local: " << *buffer; - continue; + << " marked as entry computation parameter: " << *hlo_buffer; + return Status::OK(); } + } - if (buffer->shape().IsTuple()) { + if (is_thread_local) { + BufferAllocation* allocation = + assignment->NewAllocation(*hlo_buffer, buffer_size); + allocation->set_is_thread_local(true); + VLOG(3) << "New allocation #" << allocation->index() + << " for thread-local: " << *hlo_buffer; + return Status::OK(); + } + + for (const HloValue* value : hlo_buffer->values()) { + if (value->shape().IsTuple()) { BufferAllocation* allocation = - assignment->NewAllocation(*buffer, buffer_size); + assignment->NewAllocation(*hlo_buffer, buffer_size); allocation->set_is_tuple(true); VLOG(3) << "New allocation #" << allocation->index() - << " for tuple-shaped buffer: " << *buffer; - continue; + << " for tuple-shaped buffer: " << *hlo_buffer; + return Status::OK(); } - // First try to assign a LogicalBuffer to one of its operand allocations to - // improve locality. This is only possible with elementwise operations - // (checked in liveness analysis) which are necessarily top-level - // array-shaped buffers. - if (buffer->IsTopLevel() && !buffer->IsTuple()) { + if (value->IsTopLevel() && !value->IsTuple()) { + const HloInstruction* instruction = value->instruction(); for (auto* operand : instruction->operands()) { - bool assigned_operand = false; for (const auto& operand_slice : assignment->GetAllSlices(operand, /*index=*/{})) { BufferAllocation* allocation = assignment->GetMutableAllocation(operand_slice.index()); - if (!colocated_allocations.contains(allocation->index())) { - // TODO(b/32491382) Colocated buffers are currently assigned in an - // earlier pass, and so can break the "increasing allocation size" - // invariant in this function (causing this CHECK to fail). However, - // the call to MaybeAssignBuffer is safe as it returns false if - // allocation.size < buffer.size. - CHECK_GE(allocation->size(), buffer_size); - } - if (MaybeAssignBuffer(allocation, *buffer, assignment)) { + if (MaybeAssignBuffer(allocation, *hlo_buffer, assignment)) { VLOG(3) << "Reusing (operand) allocation #" << allocation->index() - << " for: " << *buffer; - assigned_operand = true; - break; + << " for: " << *hlo_buffer; + return Status::OK(); } } - if (assigned_operand) { - break; - } } } + } - if (reuse_colocated_checker_ != nullptr && - reuse_colocated_checker_(*buffer, buffer_size) && - !assignment->HasAllocation(*buffer)) { - // Find the smallest buffer which can be reused iterating from the lower - // bound of the buffer size in colocated_allocation_size_to_indices. 
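// A condensed sketch (hypothetical types, not the real AssignSingleHloBuffer)
// of the order in which special cases are tried above for one buffer:
// constants, entry-computation parameters, thread-local buffers, and tuples
// each get a dedicated allocation; everything else falls through to the reuse
// paths shown in the surrounding code (operand allocations first, then
// previously created allocations).
enum class AllocationKind {
  kConstant,
  kEntryParameter,
  kThreadLocal,
  kTuple,
  kTryReuse,  // Fall through to operand / existing-allocation reuse.
};

struct SketchBufferTraits {
  bool is_constant = false;
  bool is_entry_parameter = false;
  bool is_tuple_shaped = false;
};

AllocationKind ClassifyBuffer(const SketchBufferTraits& traits,
                              bool is_thread_local) {
  if (traits.is_constant) return AllocationKind::kConstant;
  if (traits.is_entry_parameter) return AllocationKind::kEntryParameter;
  if (is_thread_local) return AllocationKind::kThreadLocal;
  if (traits.is_tuple_shaped) return AllocationKind::kTuple;
  return AllocationKind::kTryReuse;
}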
- auto it = colocated_allocation_size_to_indices.lower_bound(buffer_size); - while (it != colocated_allocation_size_to_indices.end()) { - CHECK_GE(it->first, buffer_size); - BufferAllocation* allocation = - assignment->GetMutableAllocation(it->second); - if (MaybeAssignBuffer(allocation, *buffer, assignment)) { - VLOG(3) << "Reusing allocation #" << allocation->index() - << " for: " << *buffer; - // We remove the assigned allocation from - // colocated_allocation_size_to_indices to prevent putting too many - // buffers into collocated allocations, and to reduce the search space - // for subsequent buffers. This is to avoid excessive pairwise checks - // for interference that may slow down compilation. The heap simulator - // is more efficient in live range checks. - // - // Another benefit of removing the allocation is that the reused - // allocation will be less likely to contain interferences that - // prevent operand-output reuse, which is important for in-place - // dynamic update slices. - colocated_allocation_size_to_indices.erase(it); - break; - } - ++it; - } + // Find the smallest buffer which can be reused iterating from end of + // allocation_indices (smallest) to beginning (largest). + for (int allocation_index = allocation_indices->size() - 1; + allocation_index >= 0; allocation_index--) { + BufferAllocation* allocation = assignment->GetMutableAllocation( + allocation_indices->at(allocation_index)); + if (MaybeAssignBuffer(allocation, *hlo_buffer, assignment)) { + VLOG(3) << "Reusing allocation #" << allocation->index() + << " for: " << *hlo_buffer; + return Status::OK(); } + } - if (!assignment->HasAllocation(*buffer)) { - // Find the smallest buffer which can be reused iterating from end of - // new_allocation_indices (smallest) to beginning (largest). - for (int allocation_index = new_allocation_indices.size() - 1; - allocation_index >= 0; allocation_index--) { - BufferAllocation* allocation = assignment->GetMutableAllocation( - new_allocation_indices[allocation_index]); - // Instructions are iterated in increasing buffer size, so any - // previously create allocation must be large enough to hold this - // instruction's output. - if (MaybeAssignBuffer(allocation, *buffer, assignment)) { - VLOG(3) << "Reusing allocation #" << allocation->index() - << " for: " << *buffer; - break; - } - } - } - - if (!assignment->HasAllocation(*buffer) && has_sequential_order && - !liveness.MaybeLiveOut(*buffer)) { + if (hlo_buffer->values().size() == 1) { + HloComputation* computation = + hlo_buffer->values()[0]->instruction()->parent(); + const bool has_sequential_order = + assignment->liveness().hlo_ordering().SequentialOrder(*computation) != + nullptr; + if (!assignment->HasAllocation(*hlo_buffer) && has_sequential_order && + !assignment->alias_analysis().BufferLivesOut(*hlo_buffer)) { // There is a sequential instruction ordering, so we delay assignment of // temp buffers until after the loop. We do this right before we decide to // create a new allocation, to ensure we've exhausted all the buffer @@ -1164,30 +1063,124 @@ Status BufferAssigner::AssignBuffersForComputation( // Entry parameters and thread local buffers were already handled earlier // in this loop iteration. See BufferAllocation::IsPreallocatedTempBuffer // for the definition of temp buffers. 
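// A standalone sketch (not XLA code) of the reuse walk in the new code above:
// because buffers are assigned in decreasing size order, allocations are
// created from largest to smallest, so scanning the recorded allocation
// indices from the back finds the smallest allocation that can still fit the
// buffer. The real code additionally checks interference via MaybeAssignBuffer.
#include <cstdint>
#include <vector>

struct SketchAlloc {
  int64_t size = 0;
};

// Returns the index into `allocations` of the smallest allocation with
// size >= buffer_size, or -1 if none fits. `creation_order` lists allocation
// indices from largest (front) to smallest (back).
int FindSmallestFit(const std::vector<SketchAlloc>& allocations,
                    const std::vector<int>& creation_order,
                    int64_t buffer_size) {
  for (int i = static_cast<int>(creation_order.size()) - 1; i >= 0; --i) {
    const int index = creation_order[i];
    if (allocations[index].size >= buffer_size) {
      return index;  // First fit from the small end is the smallest fit.
    }
  }
  return -1;
}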
- CHECK(!is_entry_parameter) << *buffer; - CHECK(!is_thread_local) << *buffer; - (*buffers_to_assign_sequentially)[computation].insert(buffer); - VLOG(3) << "Delaying assignment of temp buffer: " << *buffer; - continue; - } - - if (!assignment->HasAllocation(*buffer)) { - BufferAllocation* allocation = - assignment->NewAllocation(*buffer, buffer_size); - new_allocation_indices.push_back(allocation->index()); - VLOG(3) << "New allocation #" << allocation->index() - << " for: " << *buffer; + (*buffers_to_assign_sequentially)[computation].insert( + hlo_buffer->values()[0]); + VLOG(3) << "Delaying assignment of temp buffer: " << *hlo_buffer; + return Status::OK(); } } + if (!assignment->HasAllocation(*hlo_buffer)) { + BufferAllocation* allocation = + assignment->NewAllocation(*hlo_buffer, buffer_size); + allocation_indices->push_back(allocation->index()); + VLOG(3) << "New allocation #" << allocation->index() + << " for: " << *hlo_buffer; + } + + TF_RET_CHECK(assignment->HasAllocation(*hlo_buffer)); return Status::OK(); } -flat_hash_map, +Status BufferAssigner::AssignBuffersForComputations( + const std::vector& computations, + bool is_thread_local, + absl::flat_hash_map>* + buffers_to_assign_sequentially, + BufferAssignment* assignment) { + if (computations.empty()) { + return Status::OK(); + } + std::vector sorted_buffers; + + const HloAliasAnalysis& alias_analysis = assignment->alias_analysis(); + for (const HloBuffer& buffer : alias_analysis.buffers()) { + TF_RET_CHECK(!buffer.values().empty()); + const HloComputation* comp = buffer.values()[0]->instruction()->parent(); + if (absl::c_linear_search(computations, comp)) { + sorted_buffers.push_back(&buffer); + } + } + + // Generate a post order sort of instructions for sorting of the + // HloBuffers. + flat_hash_map post_order_position; + int position = 0; + std::vector reverse_post_order_computations; + std::unique_ptr call_graph = + CallGraph::Build(computations[0]->parent()); + TF_RETURN_IF_ERROR(call_graph->VisitNodes([&](const CallGraphNode& node) { + if (absl::c_linear_search(computations, node.computation())) { + reverse_post_order_computations.push_back(node.computation()); + } + return Status::OK(); + })); + absl::c_reverse(reverse_post_order_computations); + for (auto* computation : reverse_post_order_computations) { + for (auto* instruction : computation->MakeInstructionPostOrder()) { + post_order_position.emplace(instruction, position); + position++; + } + } + + const BufferLiveness& liveness = assignment->liveness(); + for (const HloComputation* computation : computations) { + const bool has_sequential_order = + liveness.hlo_ordering().SequentialOrder(*computation) != nullptr; + if (has_sequential_order && buffers_to_assign_sequentially != nullptr) { + // Every sequential computation must get an entry in the + // buffers_to_assign_sequentially map, even if we end up with an empty + // set of buffers. This ensures we can correctly determine whether to + // run whole-module heap simulation. + buffers_to_assign_sequentially->emplace( + computation, flat_hash_set()); + } + } + + absl::c_sort( + sorted_buffers, [&post_order_position, &alias_analysis, assignment]( + const HloBuffer* a, const HloBuffer* b) { + // Primary sort is by decreasing buffer size. + const int64 a_size = assignment->HloBufferSize(*a); + const int64 b_size = assignment->HloBufferSize(*b); + if (a_size != b_size) { + return a_size > b_size; // use ">" for decreasing size. 
+ } + + const bool a_live_out = alias_analysis.BufferLivesOut(*a); + const bool b_live_out = alias_analysis.BufferLivesOut(*b); + if (a_live_out != b_live_out) { + return a_live_out; + } + auto compare = [&post_order_position](const HloValue* value1, + const HloValue* value2) { + return post_order_position.at(value1->instruction()) < + post_order_position.at(value2->instruction()); + }; + const HloValue* a_min = *absl::c_min_element(a->values(), compare); + const HloValue* b_min = *absl::c_min_element(b->values(), compare); + return post_order_position.at(a_min->instruction()) < + post_order_position.at(b_min->instruction()); + }); + + std::vector allocation_indices; + + for (const HloBuffer* buffer : sorted_buffers) { + VLOG(3) << "================================================="; + VLOG(3) << "Assigning buffer for " << *buffer; + TF_RETURN_IF_ERROR(AssignSingleHloBuffer(buffer, is_thread_local, + buffers_to_assign_sequentially, + &allocation_indices, assignment)); + } + return Status::OK(); +} + +flat_hash_map, LogicalBuffer::Color::Hasher> BufferAssigner::SplitBuffersByColor( - const flat_hash_set& buffers) { - flat_hash_map, + const flat_hash_set& buffers) { + flat_hash_map, LogicalBuffer::Color::Hasher> color_map; for (auto buffer : buffers) { @@ -1196,14 +1189,76 @@ BufferAssigner::SplitBuffersByColor( return color_map; } +BufferValueFlatSet BufferAssigner::HloValueSetToLogicalBufferSet( + const absl::flat_hash_set& hlo_value_set, + const TuplePointsToAnalysis& points_to_analysis) { + BufferValueFlatSet output; + for (const BufferValue* buffer_value : hlo_value_set) { + const HloValue& hlo_value = + *CHECK_NOTNULL(dynamic_cast(buffer_value)); + + for (const HloPosition& position : hlo_value.positions()) { + if (!points_to_analysis.InstructionDefinesBufferAtIndex( + position.instruction, position.index)) { + continue; + } + int64 buffer_id = + points_to_analysis + .GetBufferDefinedAt(position.instruction, position.index) + .ValueOrDie() + ->id(); + LogicalBuffer& logical_buffer = + points_to_analysis.logical_buffer(buffer_id); + if (hlo_value.has_color()) { + logical_buffer.set_color(hlo_value.color()); + } + output.insert(&logical_buffer); + } + } + return output; +} + +std::vector BufferAssigner::BuildMustAliasLogicalBufferSet( + BufferAssignment* assignment) { + VLOG(1) << "Building must alias groups."; + std::vector output; + for (const HloBuffer& hlo_buffer : assignment->alias_analysis().buffers()) { + std::vector positions = hlo_buffer.ComputePositions(); + if (positions.size() <= 1) { + continue; + } + VLOG(2) << " Must alias group:"; + BufferValueFlatSet must_alias; + for (const HloPosition& hlo_position : positions) { + VLOG(2) << " hlo_position:" << hlo_position.ToString(); + + StatusOr logical_buffer = + assignment->points_to_analysis().GetBufferDefinedAt( + hlo_position.instruction, hlo_position.index); + if (!logical_buffer.ok()) { + // Buffer is not defined at this position. + continue; + } + + VLOG(2) << " logical buffer:" + << logical_buffer.ValueOrDie()->ToString(); + must_alias.insert(logical_buffer.ValueOrDie()); + } + if (must_alias.size() > 1) { + output.push_back(must_alias); + } + } + return output; +} + Status BufferAssigner::AssignBuffersWithSequentialOrdering( const flat_hash_map>& + flat_hash_set>& buffers_to_assign_sequentially, bool run_whole_module_heap_simulation, BufferAssignment* assignment) { - // Run the sequence of instructions through the heap simulator. 
The heuristic - // that seems to give the best results is lazy-best-fit, with all runs of - // alloc / free calls sorted in decreasing size order. + // Run the sequence of instructions through the heap simulator. The + // heuristic that seems to give the best results is lazy-best-fit, with all + // runs of alloc / free calls sorted in decreasing size order. const HloOrdering& hlo_ordering = assignment->liveness().hlo_ordering(); // Returns a heap algorithm that chooses the best result from several @@ -1218,17 +1273,23 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( return absl::make_unique(std::move(algorithms)); }; + // The API of heap simulator is currently logical buffer based and buffer + // assignment currently uses HloValue. As an intermediate step, we convert + // between logical buffer and HloValue around the API boundary. + // + // TODO(yunxing): Update heap simulator to use HloValue and remove the + // conversions. if (run_whole_module_heap_simulation) { - // Run the heap simulation over the whole module. This reduces memory usage, - // since buffers for kCall, kWhile, and kConditional sub-computations are - // only live for the duration of their calling instructions. + // Run the heap simulation over the whole module. This reduces memory + // usage, since buffers for kCall, kWhile, and kConditional + // sub-computations are only live for the duration of their calling + // instructions. VLOG(1) << "Running whole-module heap simulation"; HloSchedule schedule(&assignment->module()); - flat_hash_set all_buffers_to_assign; + flat_hash_set all_buffers_to_assign; for (const auto& pair : buffers_to_assign_sequentially) { const HloComputation* computation = pair.first; - const flat_hash_set& buffers_to_assign = - pair.second; + const flat_hash_set& buffers_to_assign = pair.second; const HloInstructionSequence* instruction_sequence = hlo_ordering.SequentialOrder(*computation); CHECK(instruction_sequence != nullptr) << computation->name(); @@ -1243,15 +1304,22 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( int64 alignment = assignment->color_alignment_(color); HeapSimulator::Options options; options.alloc_constants = allocate_buffers_for_constants_; - BufferValueFlatSet buffer_value_set = - ToBufferValueFlatSet(single_colored_set.second); + // At the API boundary between buffer_assignment and heap simulator, + // TuplePointsTo and LogicalBuffer are expected. 
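// A toy, self-contained illustration (not the real HeapSimulator) of the kind
// of placement the simulation invoked just below computes: each buffer has a
// live range and a size, and is packed at the lowest offset that does not
// collide with any already-placed buffer whose live range overlaps. The real
// simulator uses stronger heuristics (e.g. best-fit over a sorted run of
// alloc/free events) and picks the best of several algorithms.
#include <cstdint>
#include <vector>

struct ToyBuffer {
  int64_t size = 0;
  int64_t live_start = 0;
  int64_t live_end = 0;  // Exclusive.
};

struct ToyChunk {
  int64_t offset = 0;
  int64_t size = 0;
};

std::vector<ToyChunk> NaivePack(const std::vector<ToyBuffer>& buffers) {
  std::vector<ToyChunk> chunks(buffers.size());
  for (size_t i = 0; i < buffers.size(); ++i) {
    int64_t offset = 0;
    bool moved = true;
    // Bump the candidate offset until it clears every temporally overlapping,
    // already-placed buffer.
    while (moved) {
      moved = false;
      for (size_t j = 0; j < i; ++j) {
        const bool time_overlap = buffers[i].live_start < buffers[j].live_end &&
                                  buffers[j].live_start < buffers[i].live_end;
        const bool space_overlap = offset < chunks[j].offset + chunks[j].size &&
                                   chunks[j].offset < offset + buffers[i].size;
        if (time_overlap && space_overlap) {
          offset = chunks[j].offset + chunks[j].size;
          moved = true;
        }
      }
    }
    chunks[i] = ToyChunk{offset, buffers[i].size};
  }
  return chunks;
}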
+ BufferValueFlatSet buffer_value_set = HloValueSetToLogicalBufferSet( + single_colored_set.second, assignment->points_to_analysis()); options.buffers_to_assign = &buffer_value_set; + + options.must_alias_sets = BuildMustAliasLogicalBufferSet(assignment); TF_ASSIGN_OR_RETURN( - const HeapSimulator::Result result, + HeapSimulator::Result result, HeapSimulator::Run(get_heap_algorithm(alignment), assignment->module(), schedule, assignment->points_to_analysis(), assignment->buffer_size_, options)); + ConvertHeapSimulatorResultToHloValue(&result, + assignment->dataflow_analysis(), + assignment->points_to_analysis()); AssignBuffersFromHeapSimulator(result, assignment, single_colored_set.first); } @@ -1262,8 +1330,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( VLOG(1) << "Running per-computation heap simulation"; for (const auto& pair : buffers_to_assign_sequentially) { const HloComputation* computation = pair.first; - const flat_hash_set& buffers_to_assign = - pair.second; + const flat_hash_set& buffers_to_assign = pair.second; const HloInstructionSequence* instruction_sequence = hlo_ordering.SequentialOrder(*computation); CHECK(instruction_sequence != nullptr) << computation->name(); @@ -1273,15 +1340,21 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( VLOG(2) << "Simulating heap for color " << color; int64 alignment = assignment->color_alignment_(color); HeapSimulator::Options options; - BufferValueFlatSet buffer_value_set = - ToBufferValueFlatSet(single_colored_set.second); + // At the API boundary between buffer_assignment and heap simulator, + // TuplePointsTo and LogicalBuffer are expected. + BufferValueFlatSet buffer_value_set = HloValueSetToLogicalBufferSet( + single_colored_set.second, assignment->points_to_analysis()); options.buffers_to_assign = &buffer_value_set; + options.must_alias_sets = BuildMustAliasLogicalBufferSet(assignment); TF_ASSIGN_OR_RETURN( - const HeapSimulator::Result result, + HeapSimulator::Result result, HeapSimulator::Run(get_heap_algorithm(alignment), *computation, *instruction_sequence, assignment->points_to_analysis(), assignment->buffer_size_, options)); + ConvertHeapSimulatorResultToHloValue(&result, + assignment->dataflow_analysis(), + assignment->points_to_analysis()); AssignBuffersFromHeapSimulator(result, assignment, single_colored_set.first); } @@ -1291,35 +1364,37 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( } namespace { - -// Computes and returns the set of logical buffers live at the point of maximal -// liveness in the given heap trace. LogicalBuffers are (stabily) sorted by id. -std::vector ComputePeakMemoryLogicalBuffers( +// Computes and returns the set of logical buffers live at the point of +// maximal liveness in the given heap trace. LogicalBuffers are (stabily) +// sorted by id. +std::vector ComputePeakMemoryLogicalBuffers( const BufferAllocation& allocation, const HeapSimulatorTrace& heap_trace) { // Create a map from LogicalBuffer::Id to LogicalBuffer* for the logical // buffers in this allocation. 
- absl::flat_hash_map id_to_buffer; - absl::flat_hash_map buffer_sizes; + absl::flat_hash_map id_to_value; + absl::flat_hash_map buffer_sizes; for (const auto& pair : allocation.assigned_buffers()) { - const LogicalBuffer* buffer = pair.first; + const BufferValue* value = pair.first; const BufferAllocation::OffsetSize& offset_size = pair.second; - id_to_buffer[buffer->id()] = buffer; - buffer_sizes[buffer] = offset_size.size; + id_to_value[value->id()] = value; + buffer_sizes[value] = offset_size.size; } + VLOG(1) << "Compute peak memory logical buffers"; // Returns how much the given event increases the total size of live // buffers. Can be negative. - auto memory_delta = [&id_to_buffer, &buffer_sizes]( + auto memory_delta = [&id_to_value, &buffer_sizes]( const HeapSimulatorTrace::Event& event) -> int64 { - const LogicalBuffer* buffer = id_to_buffer.at(event.buffer_id()); + if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) { + // Sharing a buffer does not change the live set size for the purposes + // of the heap simulator. Even though the shared-with buffer may be + // smaller, the entire allocation remains live. + return 0; + } + const BufferValue* buffer = id_to_value.at(event.buffer_id()); const int64 buffer_size = buffer_sizes.at(buffer); if (event.kind() == HeapSimulatorTrace::Event::ALLOC) { return buffer_size; - } else if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) { - // Sharing a buffer does not change the live set size for the purposes of - // the heap simulator. Even though the shared-with buffer may be smaller, - // the entire allocation remains live. - return 0; } else if (event.kind() == HeapSimulatorTrace::Event::FREE) { return -1 * buffer_size; } @@ -1338,43 +1413,48 @@ std::vector ComputePeakMemoryLogicalBuffers( // Next gather the set of logical buffers live at the earliest point of // maximal live set size. - absl::flat_hash_set live_buffers; + absl::flat_hash_set live_values; live_size = 0; for (const auto& event : heap_trace.events()) { - const LogicalBuffer* buffer = id_to_buffer.at(event.buffer_id()); - if (event.kind() == HeapSimulatorTrace::Event::ALLOC) { - InsertOrDie(&live_buffers, buffer); - } else if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) { - // Nothing to do. - } else if (event.kind() == HeapSimulatorTrace::Event::FREE) { - CHECK(ContainsKey(live_buffers, buffer)); - live_buffers.erase(buffer); + if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) { + continue; + } + const BufferValue* value = id_to_value.at(event.buffer_id()); + if (event.kind() == HeapSimulatorTrace::Event::ALLOC) { + InsertOrDie(&live_values, value); + } else if (event.kind() == HeapSimulatorTrace::Event::FREE) { + CHECK(ContainsKey(live_values, value)); + live_values.erase(value); } - live_size += memory_delta(event); + if (live_size == max_live_size) { break; } } CHECK_EQ(live_size, max_live_size); - std::vector live_buffers_vector; - live_buffers_vector.insert(live_buffers_vector.end(), live_buffers.begin(), - live_buffers.end()); + std::vector live_values_vector; + live_values_vector.insert(live_values_vector.end(), live_values.begin(), + live_values.end()); // Stabily sort the live buffers. 
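// A standalone sketch (not the XLA version above) of computing the peak live
// set from a trace of ALLOC/FREE events, using the same two-pass structure:
// first find the maximal running size, then re-walk the trace and stop at the
// earliest point where that size is reached, returning the buffers live there.
// Using std::set keeps the ids ordered, paralleling the stable sort by id.
#include <algorithm>
#include <cstdint>
#include <set>
#include <vector>

struct TraceEvent {
  enum Kind { kAlloc, kFree } kind;
  int buffer_id;
  int64_t size;
};

std::set<int> PeakLiveBuffers(const std::vector<TraceEvent>& trace) {
  // Pass 1: find the maximum total size of simultaneously live buffers.
  int64_t live = 0;
  int64_t max_live = 0;
  for (const TraceEvent& e : trace) {
    live += (e.kind == TraceEvent::kAlloc) ? e.size : -e.size;
    max_live = std::max(max_live, live);
  }
  // Pass 2: stop at the earliest event after which the live size equals the
  // maximum, and return the set of buffers live at that point.
  std::set<int> live_set;
  live = 0;
  for (const TraceEvent& e : trace) {
    if (e.kind == TraceEvent::kAlloc) {
      live_set.insert(e.buffer_id);
      live += e.size;
    } else {
      live_set.erase(e.buffer_id);
      live -= e.size;
    }
    if (live == max_live) break;
  }
  return live_set;
}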
- absl::c_sort(live_buffers_vector, - [](const LogicalBuffer* a, const LogicalBuffer* b) { + absl::c_sort(live_values_vector, + [](const BufferValue* a, const BufferValue* b) { return a->id() < b->id(); }); - return live_buffers_vector; + VLOG(4) << "Peak memory buffer:"; + for (auto value : live_values_vector) { + VLOG(4) << " " << value->ToString(); + } + return live_values_vector; } } // namespace void BufferAssigner::AssignBuffersFromHeapSimulator( const HeapSimulator::Result& result, BufferAssignment* assignment, - LogicalBuffer::Color color) { + BufferValue::Color color) { if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) { assignment->stats_.preallocated_temp_fragmentation_bytes = result.fragmentation_size; @@ -1386,499 +1466,96 @@ void BufferAssigner::AssignBuffersFromHeapSimulator( BufferAllocation* allocation = assignment->NewEmptyAllocation(result.heap_size, color); for (const auto& buffer_chunk : result.chunk_map) { - // TODO(lauj) Remove this down_cast after downstream users of - // BufferAllocation::assigned_buffers() are updated to use BufferValue. - const LogicalBuffer& buffer = - *CHECK_NOTNULL(dynamic_cast(buffer_chunk.first)); + const BufferValue& value = *buffer_chunk.first; const HeapSimulator::Chunk& chunk = buffer_chunk.second; - assignment->AddAssignment(allocation, buffer, chunk.offset, chunk.size); + assignment->AddAssignment(allocation, value, chunk.offset, chunk.size); } allocation->peak_buffers_ = ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace); - VLOG(1) << "Ran heap simulation for allocation: " << allocation->ToString(); + VLOG(1) << "Ran heap simulation for allocation: "; + XLA_VLOG_LINES(2, allocation->ToString()); + allocation->AddHeapTrace(result.debug_trace); } -// Adds the 'colocated_set' of buffers to 'colocated_buffer_sets', maintaining -// the invariant that all sets in 'colocated_buffer_sets' are disjoint. -// -// A practical example of when this is necessary is a chain of kCall ops: -// computation.entry -// %a = call() -> computation.1 -// computation.1 -// %b = call() -> computation.2 -// computation.2 -// %c = parameter() -// This yields the logical sets {%a,%b} {%b,%c} {%c}, which need to be merged -// into a single set {%a,%b,%c} -void BufferAssigner::AddSetToColocatedBufferSets( - const std::vector& colocated_set, - std::vector* colocated_buffer_sets) { - if (colocated_set.empty()) { - return; - } - VLOG(5) << ColocatedBufferSetsToString(colocated_set, - "Adding colocated buffer set"); - // Find existing sets that overlap with at least one buffer from the - // colocated_set. The resulting 'overlap_set_indices' will have at most - // colocated_buffer_sets->size() entries, and will be in increasing order. - std::vector overlap_set_indices; - for (size_t index = 0; index < colocated_buffer_sets->size(); ++index) { - for (const LogicalBuffer* buffer : colocated_set) { - if ((*colocated_buffer_sets)[index].contains(buffer)) { - VLOG(5) << "Found overlap with existing set on buffer " - << buffer->ToString() << "\n" - << ColocatedBufferSetsToString((*colocated_buffer_sets)[index], - "Overlapping set"); - overlap_set_indices.push_back(index); - break; - } - } - } - - // If there is no overlap with existing sets, create a new set. 
- if (overlap_set_indices.empty()) { - colocated_buffer_sets->emplace_back(); - colocated_buffer_sets->back().insert(colocated_set.begin(), - colocated_set.end()); - VLOG(5) << "No overlap found, new group created"; - return; - } - - // Merge all overlap sets and the colocated set into the first overlap set. - ColocatedBufferSet* first = &(*colocated_buffer_sets)[overlap_set_indices[0]]; - for (size_t index = 1; index < overlap_set_indices.size(); ++index) { - const ColocatedBufferSet& overlap_set = - (*colocated_buffer_sets)[overlap_set_indices[index]]; - first->insert(overlap_set.begin(), overlap_set.end()); - } - first->insert(colocated_set.begin(), colocated_set.end()); - VLOG(5) << ColocatedBufferSetsToString( - *first, "Result of the colocated buffer set merging"); - - // Remove overlap sets that we just merged. The offset accounts for the fact - // that as elements are erased, the indices need to be adjusted. Keep in mind - // that overlap_set_indices is in increasing order. - for (size_t index = 1; index < overlap_set_indices.size(); ++index) { - const size_t offset = overlap_set_indices[index] - index + 1; - colocated_buffer_sets->erase(colocated_buffer_sets->begin() + offset); - } -} - -std::vector -BufferAssigner::MergeColocatedBufferSets( - const std::vector& colocated_buffer_sets, - const BufferLiveness& buffer_liveness, - const LogicalBuffer::SizeFunction& buffer_size) { - VLOG(1) << "colocation sets count before coalescing:" - << colocated_buffer_sets.size(); - - // Returns true if the given buffer is for the entry parameter. - auto is_readonly_entry_parameter = [](const LogicalBuffer& buffer) { - auto* instruction = buffer.instruction(); - auto* computation = instruction->parent(); - auto* module = computation->parent(); - return instruction->opcode() == HloOpcode::kParameter && - computation == module->entry_computation() && - !module->input_output_alias_config().ParameterHasAlias( - instruction->parameter_number(), buffer.index()); - }; - - std::vector set_can_be_merged(colocated_buffer_sets.size(), true); - - // Do not merge if one of the sets includes live outs, entry parameters or - // constants. - // - // Buffer liveness does not report the correct live range for entry - // parameter and live out buffers so we have to special case them here. On - // backends that support constant buffer allocations, constant buffers are - // assigned globals in readonly storage so we can't merge colocated buffer - // sets containing constants with colocated buffer sets containing writing - // instructions or other constants. - // - // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to - // the caller of the executable so we can't write to entry parameters - // either, and the argument for not merging constants also applies to entry - // parameters. - for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) { - for (auto& buffer : colocated_buffer_sets[i]) { - if (buffer_liveness.MaybeLiveOut(*buffer) || - is_readonly_entry_parameter(*buffer) || - buffer->instruction()->opcode() == HloOpcode::kConstant) { - set_can_be_merged[i] = false; - break; - } - } - } - - // Returns true if the two colocated buffer sets (specified by their indices - // into the colocated_buffer_sets) can be merged into a single set. 
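// A simplified model (hypothetical helper, not the removed XLA routine) of the
// disjoint-set maintenance performed by the AddSetToColocatedBufferSets code
// above: a new colocated set is merged with every existing set it overlaps, so
// the collection stays pairwise disjoint, e.g. {a,b} + {b,c} => {a,b,c}.
#include <set>
#include <vector>

void AddDisjointSet(const std::set<int>& new_set,
                    std::vector<std::set<int>>* sets) {
  if (new_set.empty()) return;
  std::set<int> merged(new_set);
  std::vector<std::set<int>> remaining;
  for (const std::set<int>& existing : *sets) {
    bool overlaps = false;
    for (int element : existing) {
      if (merged.count(element) > 0) {
        overlaps = true;
        break;
      }
    }
    if (overlaps) {
      merged.insert(existing.begin(), existing.end());  // Fold into the merge.
    } else {
      remaining.push_back(existing);  // Keep disjoint sets untouched.
    }
  }
  remaining.push_back(merged);
  *sets = remaining;
}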
- auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness, - &buffer_size, - &set_can_be_merged](int64 i, int64 j) { - if (!set_can_be_merged[i] || !set_can_be_merged[j]) { - return true; - } - - // Colocated sets satisfy the invariant that all buffers within a set have - // the same size. That means we need to check whether the size is the same - // between the two sets, but also that it's enough to look at just one - // buffer within each set. - if (buffer_size(**colocated_buffer_sets[i].begin()) != - buffer_size(**colocated_buffer_sets[j].begin())) { - return true; - } - - // Do not merge if some pair of buffers interferes with each other. - for (auto& buffer_a : colocated_buffer_sets[i]) { - for (auto& buffer_b : colocated_buffer_sets[j]) { - if (buffer_a->id() != buffer_b->id() && - buffer_liveness.MayInterfere(*buffer_a, *buffer_b)) { - return true; - } - } - } - - return false; - }; - - // Build the interference map among the colocated buffer sets (nodes), by - // adding an edge between any two nodes that cannot be merged into a single - // colocated buffer set. - std::vector> interference_map( - colocated_buffer_sets.size()); - for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) { - for (int64 j = i + 1; j < colocated_buffer_sets.size(); ++j) { - if (cannot_merge_buffer_sets(i, j)) { - interference_map[i].push_back(j); - interference_map[j].push_back(i); - } - } - } - - // Assign a color to each colocation set in colocated_buffer_sets, such that - // the sets that can be merged are assigned with the same color. - auto assigned_colors = ColorInterferenceGraph(interference_map); - - // Merge the buffer sets with the same color. - CHECK(!assigned_colors.empty()); - int64 num_sets = - *std::max_element(assigned_colors.begin(), assigned_colors.end()) + 1; - std::vector new_colocated_buffer_sets(num_sets); - for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) { - const auto& buffer_set = colocated_buffer_sets[i]; - new_colocated_buffer_sets[assigned_colors[i]].insert(buffer_set.begin(), - buffer_set.end()); - } - - VLOG(1) << "colocation sets count after coalescing:" - << colocated_buffer_sets.size(); - return new_colocated_buffer_sets; -} - -// Builds sets of buffers in 'colocated_buffer_sets' which should be colocated -// in the same allocation (currently just supports kWhile, kCall, and -// kConditional and input output aliasing). -void BufferAssigner::BuildColocatedBufferSets( - const HloModule* module, const BufferLiveness& buffer_liveness, - const LogicalBuffer::SizeFunction& buffer_size, - std::vector* colocated_buffer_sets) { - const TuplePointsToAnalysis& points_to_analysis = - buffer_liveness.points_to_analysis(); - - // Set up colocated buffer set for input and output. 
- VLOG(4) << "Input/Output Alias Config: "; - VLOG(4) << module->input_output_alias_config(); - module->input_output_alias_config().ForEachAlias( - [&](const ShapeIndex& output_index, - const HloInputOutputAliasConfig::Alias& alias) { - std::vector colocated_set; - AddBufferToColocatedSet(module->entry_computation()->root_instruction(), - output_index, points_to_analysis, - &colocated_set); - AddBufferToColocatedSet( - module->entry_computation()->parameter_instruction( - alias.parameter_number), - alias.parameter_index, points_to_analysis, &colocated_set); - AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets); - }); - - for (const HloComputation* computation : module->MakeComputationPostOrder()) { - if (computation->IsFusionComputation()) { - continue; - } - for (const HloInstruction* instruction : - computation->MakeInstructionPostOrder()) { - const HloOpcode opcode = instruction->opcode(); - if (opcode == HloOpcode::kWhile) { - const HloInstruction* while_hlo = instruction; - ShapeUtil::ForEachSubshape( - while_hlo->shape(), - [this, while_hlo, &points_to_analysis, buffer_size, - colocated_buffer_sets](const Shape& /*subshape*/, - const ShapeIndex& index) { - std::vector colocated_set; - // Add while.init. - AddBufferToColocatedSet(while_hlo->operand(0), index, - points_to_analysis, &colocated_set); - // Add while.result. - AddBufferToColocatedSet(while_hlo, index, points_to_analysis, - &colocated_set); - // Add while.cond.parameter. - AddBufferToColocatedSet( - while_hlo->while_condition()->parameter_instruction(0), index, - points_to_analysis, &colocated_set); - // Add while.body.parameter. - AddBufferToColocatedSet( - while_hlo->while_body()->parameter_instruction(0), index, - points_to_analysis, &colocated_set); - // Add while.body.root. - AddBufferToColocatedSet( - while_hlo->while_body()->root_instruction(), index, - points_to_analysis, &colocated_set); - AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets); - }); - } else if (opcode == HloOpcode::kCall) { - const HloInstruction* call_hlo = instruction; - const HloComputation* callee = call_hlo->to_apply(); - const HloInstruction* root_hlo = callee->root_instruction(); - for (int64 i = 0; i < call_hlo->operand_count(); i++) { - const HloInstruction* call_param = callee->parameter_instruction(i); - const HloInstruction* call_operand = call_hlo->operand(i); - ShapeUtil::ForEachSubshape( - call_operand->shape(), - [&](const Shape& /*subshape*/, const ShapeIndex& index) { - std::vector colocated_set; - AddBufferToColocatedSet(call_param, index, points_to_analysis, - &colocated_set); - AddBufferToColocatedSet(call_operand, index, points_to_analysis, - &colocated_set); - AddSetToColocatedBufferSets(colocated_set, - colocated_buffer_sets); - }); - } - ShapeUtil::ForEachSubshape( - call_hlo->shape(), - [this, call_hlo, root_hlo, &points_to_analysis, - colocated_buffer_sets](const Shape& /*subshape*/, - const ShapeIndex& index) { - std::vector colocated_set; - // Add call.result. - AddBufferToColocatedSet(call_hlo, index, points_to_analysis, - &colocated_set); - // Add call.subcomputation.root. 
- AddBufferToColocatedSet(root_hlo, index, points_to_analysis, - &colocated_set); - AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets); - }); - } else if (opcode == HloOpcode::kConditional) { - const HloInstruction* conditional = instruction; - ShapeUtil::ForEachSubshape( - conditional->shape(), - [this, conditional, &points_to_analysis, colocated_buffer_sets]( - const Shape& /*subshape*/, const ShapeIndex& index) { - std::vector colocated_set; - // Add cond.result. - AddBufferToColocatedSet(conditional, index, points_to_analysis, - &colocated_set); - for (int j = 0; j < conditional->branch_count(); ++j) { - // Add each cond.branch_computation[j].root. - AddBufferToColocatedSet( - conditional->branch_computation(j)->root_instruction(), - index, points_to_analysis, &colocated_set); - } - AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets); - }); - - for (int j = 0; j < conditional->branch_count(); ++j) { - // Add branch_operand[j] (which is operand[j+1]) and - // cond.branch_computation[j].parameter(0) as a colocated - // buffer set. Note that this has to be done for each subshape in the - // branch_operand of the case. - ShapeUtil::ForEachSubshape( - conditional->operand(j + 1)->shape(), - [this, j, conditional, &points_to_analysis, - colocated_buffer_sets](const Shape& /*subshape*/, - const ShapeIndex& index) { - std::vector branch_set; - // Add cond.operand[j+1]. - AddBufferToColocatedSet(conditional->operand(j + 1), index, - points_to_analysis, &branch_set); - // Add cond.branch_computation[j].parameter_instruction(0). - AddBufferToColocatedSet( - conditional->branch_computation(j)->parameter_instruction( - 0), - index, points_to_analysis, &branch_set); - AddSetToColocatedBufferSets(branch_set, colocated_buffer_sets); - }); - } - } - } - } - - if (colocated_buffer_sets->empty()) { - return; - } - - int64 i = 0; - for (const auto& colocated_set : *colocated_buffer_sets) { - VLOG(4) << "Colocated set " << i++ << ":"; - for (const auto& buffer : colocated_set) { - VLOG(4) << " " << buffer->ToString(); - } - } - // Try to find more coalescing opportunities among the colocated buffer sets. - // - // TODO(b/32491382): We should be able to remove this by using the - // module-level liveness analysis, which would let us directly detect buffer - // sharing opportunities between the while instruction buffer and the buffers - // from the predicate and body computation, as well as sharing across - // different while instructions. - std::vector new_colocated_buffer_sets = - MergeColocatedBufferSets(*colocated_buffer_sets, buffer_liveness, - buffer_size); - std::swap(*colocated_buffer_sets, new_colocated_buffer_sets); -} - -// Assigns all colocated buffer sets in 'colocated_buffer_sets' to the same -// allocation in 'assignment'. -void BufferAssigner::AssignColocatedBufferSets( - const std::vector& colocated_buffer_sets, - BufferAssignment* assignment, - flat_hash_set* colocated_buffers, - flat_hash_set* colocated_allocations) { - for (const ColocatedBufferSet& colocated_buffer_set : colocated_buffer_sets) { - BufferAllocation* allocation = nullptr; - // Set 'entry_parameter_number' and 'entry_parameter_shape_idx' if entry - // param in 'colocated_buffer_set'. 
- int64 entry_parameter_number = -1; - const ShapeIndex* entry_parameter_shape_idx = nullptr; - bool is_constant = false; - for (const LogicalBuffer* buffer : colocated_buffer_set) { - const HloInstruction* instruction = buffer->instruction(); - const HloComputation* computation = instruction->parent(); - if (instruction->opcode() == HloOpcode::kParameter && - computation == computation->parent()->entry_computation()) { - entry_parameter_number = instruction->parameter_number(); - entry_parameter_shape_idx = &buffer->index(); - } else if (instruction->opcode() == HloOpcode::kConstant) { - is_constant = true; - } - } - - CHECK(!is_constant || entry_parameter_number == -1) - << "Copy insertion should have inserted copies to prevent this."; - - for (const LogicalBuffer* buffer : colocated_buffer_set) { - const int64 buffer_size = assignment->buffer_size_(*buffer); - if (allocation == nullptr) { - // TODO(b/32491382) Avoid current trivial solution of using new - // allocations for each colocated buffer set. When liveness has - // module-level scope, we can allow buffers to be shared across - // computations (in some cases). - allocation = assignment->NewAllocation(*buffer, buffer_size); - if (is_constant) { - allocation->set_constant(true); - } - colocated_allocations->insert(allocation->index()); - } else { - CHECK_EQ(buffer_size, allocation->size()) - << "Buffer: " << *buffer << " size mismatch in colocated buffer " - << "allocation: " << *allocation; - assignment->AddAssignment(allocation, *buffer, /*offset=*/0, - buffer_size); - } - colocated_buffers->insert(buffer); - } - - // If an allocation contains a parameter, set corresponding fields. - if (entry_parameter_number >= 0) { - bool parameter_has_alias = - assignment->module().input_output_alias_config().ParameterHasAlias( - entry_parameter_number, *entry_parameter_shape_idx); - allocation->set_entry_computation_parameter(entry_parameter_number, - *entry_parameter_shape_idx, - parameter_has_alias); - } - } -} - StatusOr> BufferAssigner::CreateAssignment( const HloModule* module, std::unique_ptr hlo_ordering, - LogicalBuffer::SizeFunction buffer_size, - LogicalBuffer::AlignmentFunction color_alignment) { + BufferValue::SizeFunction buffer_size, + LogicalBuffer::AlignmentFunction color_alignment, + HloDataflowAnalysis::FusionCanShareBufferFunction fusion_can_share_buffer) { TF_ASSIGN_OR_RETURN(std::unique_ptr liveness, BufferLiveness::Run(module, std::move(hlo_ordering))); + TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, + HloAliasAnalysis::Run(module, fusion_can_share_buffer)); + VLOG(1) << "Assigning buffers to module " << module->name(); XLA_VLOG_LINES(2, module->ToString()); - XLA_VLOG_LINES(3, liveness->ToString()); - XLA_VLOG_LINES(3, liveness->points_to_analysis().ToString()); + XLA_VLOG_LINES(3, alias_analysis->ToString()); + XLA_VLOG_LINES(3, alias_analysis->dataflow_analysis().ToString()); + VLOG(1) << "Number of buffers to assign: " + << alias_analysis->buffers().size(); // Can't use absl::make_unique because BufferAssignment constructor is // private. - std::unique_ptr assignment( - new BufferAssignment(module, std::move(liveness), std::move(buffer_size), - std::move(color_alignment))); + std::unique_ptr assignment(new BufferAssignment( + module, std::move(liveness), std::move(buffer_size), + std::move(color_alignment), std::move(alias_analysis))); - // Assign buffers with the tightest constraints first (colocated buffer sets). 
- // Once b/32491382 enables module-level liveness analysis, we may be able - // to assign colocated buffers (or at least reuse their allocation for - // buffers outside of the set) in AssignBuffersForComputation. - flat_hash_set colocated_buffers; - flat_hash_set colocated_allocations; - std::vector colocated_buffer_sets; - BuildColocatedBufferSets(module, assignment->liveness(), - assignment->buffer_size_, &colocated_buffer_sets); - TF_RETURN_IF_ERROR(colorer_(assignment->liveness())); + TF_RETURN_IF_ERROR(colorer_(&assignment->alias_analysis(), + assignment->liveness().hlo_ordering())); VLOG(3) << "After coloring:"; - XLA_VLOG_LINES(3, assignment->points_to_analysis().ToString()); - - AssignColocatedBufferSets(colocated_buffer_sets, assignment.get(), - &colocated_buffers, &colocated_allocations); + XLA_VLOG_LINES(3, + assignment->alias_analysis().dataflow_analysis().ToString()); + TF_RETURN_IF_ERROR(MergeInplaceOpBuffers(assignment.get())); std::vector thread_local_computations; std::vector global_computations; TF_RETURN_IF_ERROR(GatherComputationsByAllocationType( module, &thread_local_computations, &global_computations)); - // First assign buffers for global computatations. Temporary buffers for - // sequential computations are collected in 'buffers_to_assign_sequentially'. - flat_hash_map> + // First assign buffers for global computations. Temporary buffers for + // sequential computations are collected in + // 'buffers_to_assign_sequentially'. + flat_hash_map> buffers_to_assign_sequentially; - for (auto* computation : global_computations) { - TF_RETURN_IF_ERROR(AssignBuffersForComputation( - computation, - /*is_thread_local=*/false, colocated_buffers, colocated_allocations, - &buffers_to_assign_sequentially, assignment.get())); - } + TF_RETURN_IF_ERROR(AssignBuffersForComputations( + global_computations, + /*is_thread_local=*/false, &buffers_to_assign_sequentially, + assignment.get())); // Assign buffers with sequential ordering, if any. If all global computations // are sequential, we can run heap simuation on the whole module, which // reduces memory usage. const bool run_whole_module_heap_simulation = buffers_to_assign_sequentially.size() == global_computations.size(); + VLOG(2) << "Running whole module heap simulation" + << run_whole_module_heap_simulation; TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering( buffers_to_assign_sequentially, run_whole_module_heap_simulation, assignment.get())); + std::vector thread_local_computations_no_fusion; // Now assign buffers for thread-local computations. All LogicalBuffers get // their own BufferAllocation. + for (auto* computation : thread_local_computations) { TF_RET_CHECK(computation != module->entry_computation()); if (computation->IsFusionComputation()) { continue; } - TF_RETURN_IF_ERROR(AssignBuffersForComputation( - computation, - /*is_thread_local=*/true, colocated_buffers, colocated_allocations, - /*buffers_to_assign_sequentially=*/nullptr, assignment.get())); + thread_local_computations_no_fusion.push_back(computation); } + TF_RETURN_IF_ERROR(AssignBuffersForComputations( + thread_local_computations_no_fusion, + /*is_thread_local=*/true, + /*buffers_to_assign_sequentially=*/nullptr, assignment.get())); + // Mark all buffers which may be live out of the entry computation as // "liveout". 
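// A minimal sketch (not the real Colorer signature) of what the coloring step
// invoked above does conceptually: every value is assigned a color, and
// buffers are later partitioned by color so that each color group is assigned
// and aligned independently. The default policy added in the header further
// below gives every value color 0, i.e. a single partition.
#include <map>
#include <vector>

// value_colors[i] is the color assigned to value i; the result groups value
// indices by color, analogous to SplitBuffersByColor above.
std::map<int, std::vector<int>> SplitByColor(
    const std::vector<int>& value_colors) {
  std::map<int, std::vector<int>> by_color;
  for (int i = 0; i < static_cast<int>(value_colors.size()); ++i) {
    by_color[value_colors[i]].push_back(i);
  }
  return by_color;
}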
- for (const LogicalBuffer* buffer : - assignment->liveness().maybe_live_out_buffers()) { + for (const HloBuffer* buffer : + assignment->alias_analysis().LiveOutBuffers()) { VLOG(3) << "maybe_live_out LogicalBuffer: " << *buffer; if (assignment->HasAllocation(*buffer)) { BufferAllocation* alloc = @@ -1897,6 +1574,7 @@ StatusOr> BufferAssigner::CreateAssignment( XLA_VLOG_LINES(2, assignment->ToString()); TF_RETURN_IF_ERROR(assignment->ComputeSummaryStats()); XLA_VLOG_LINES(1, assignment->GetStats().ToString()); + VLOG(1) << "Buffer assignment done."; return std::move(assignment); } diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 41adf1b80a5..ee56e826eaf 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -28,7 +28,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_liveness.h" #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" @@ -152,8 +154,8 @@ class BufferAllocation { // Access to the logical buffers assigned to this allocation, and their // associated logical offsets and sizes. - const absl::flat_hash_map& - assigned_buffers() const { + const absl::flat_hash_map& assigned_buffers() + const { return assigned_buffers_; } @@ -206,7 +208,7 @@ class BufferAllocation { // GetSlice returns the Slice of contiguous memory that holds the value // described by the given 'buffer'. // REQUIRES: 'buffer' must be assigned to this allocation. - Slice GetSlice(const LogicalBuffer& buffer) const; + Slice GetSlice(const BufferValue& buffer) const; string ToString() const; BufferAllocationProto ToProto() const; @@ -248,9 +250,9 @@ class BufferAllocation { // for this allocation. The point of peak memory usage is the point at which // the total size of all live logical buffers is maximal. If peak memory is // reached at multiple points, the set of logical buffers live at the earliest - // maximal point is returned. The vector is stabily sorted by - // LogicalBuffer::Index. - const std::vector& PeakMemoryLogicalBuffers() const { + // maximal point is returned. The vector is stably sorted by + // BufferValue::Index. + const std::vector& PeakMemoryLogicalBuffers() const { return peak_buffers_; } @@ -275,7 +277,7 @@ class BufferAllocation { friend class BufferAssignment; // Adds a LogicalBuffer to the set assigned to this buffer. - void AddAssignment(const LogicalBuffer& buffer, int64 offset, int64 size); + void AddAssignment(const BufferValue& buffer, int64 offset, int64 size); void set_entry_computation_parameter(int64 parameter_number, ShapeIndex param_shape_index, @@ -333,13 +335,13 @@ class BufferAllocation { // Mapping from the set of buffers assigned to this allocation to their // logical offsets and sizes. - absl::flat_hash_map assigned_buffers_; + absl::flat_hash_map assigned_buffers_; int64 fragmentation_bytes_ = 0; std::vector heap_traces_; // Set of buffers live at the point of peak memory usage for this allocation. 
- std::vector peak_buffers_; + std::vector peak_buffers_; }; // Add stream operators for nicer output of CHECK/RET_CHECK failures. @@ -361,12 +363,16 @@ class BufferAssignment { } // Returns whether the given buffer has been assigned an allocation. - bool HasAllocation(const LogicalBuffer& buffer) const; + bool HasAllocation(const BufferValue& value) const; + + bool HasAllocation(const HloBuffer& buffer) const; // Returns the allocation that a particular LogicalBuffer has been assigned // to. CHECKs if buffer has not been assigned an allocation. + const BufferAllocation& GetAssignedAllocation(const BufferValue& value) const; + const BufferAllocation& GetAssignedAllocation( - const LogicalBuffer& buffer) const; + const HloBuffer& hlo_buffer) const; // Returns the allocation with the given index. CHECKs if no allocation exists // with the given index. @@ -405,11 +411,11 @@ class BufferAssignment { // computation). StatusOr GetUniqueTopLevelOutputSlice() const; - // Returns the set LogicalBuffers which may be the source of the value at the + // Returns the set BufferValues which may be the source of the value at the // given index and instruction. - const PointsToSet::BufferList& GetSourceBuffers( + const std::vector& GetSourceBuffers( const HloInstruction* instruction, const ShapeIndex& index) const { - return GetPointsToSet(instruction).element(index); + return dataflow_analysis().GetValueSet(instruction, index).values(); } // Returns true if 'hlo_a{shape_index_a}' and 'hlo_b{shape_index_b}' @@ -439,6 +445,12 @@ class BufferAssignment { return liveness_->points_to_analysis(); } + const HloDataflowAnalysis& dataflow_analysis() const { + return alias_analysis_->dataflow_analysis(); + } + + HloAliasAnalysis& alias_analysis() const { return *alias_analysis_; } + // Returns the BufferLiveness object used to construct this assignment. const BufferLiveness& liveness() const { return *liveness_; } @@ -472,12 +484,14 @@ class BufferAssignment { BufferAssignment(const HloModule* module, std::unique_ptr liveness, - LogicalBuffer::SizeFunction buffer_size, - LogicalBuffer::AlignmentFunction color_alignment) + BufferValue::SizeFunction buffer_size, + LogicalBuffer::AlignmentFunction color_alignment, + std::unique_ptr alias_analysis) : module_(module), liveness_(std::move(liveness)), buffer_size_(std::move(buffer_size)), - color_alignment_(std::move(color_alignment)) {} + color_alignment_(std::move(color_alignment)), + alias_analysis_(std::move(alias_analysis)) {} // Creates and returns a new BufferAllocation, with no assigned // LogicalBuffers. Ownership is maintained internally. @@ -485,10 +499,13 @@ class BufferAssignment { // Helper that calls NewEmptyAllocation and AddAssignment in one call, // creating an allocation containing a single LogicalBuffer. - BufferAllocation* NewAllocation(const LogicalBuffer& buffer, int64 size); + BufferAllocation* NewAllocation(const HloBuffer& buffer, int64 size); // Adds a LogicalBuffer to the set assigned to the given allocation. - void AddAssignment(BufferAllocation* allocation, const LogicalBuffer& buffer, + void AddAssignment(BufferAllocation* allocation, const HloBuffer& buffer, + int64 offset, int64 size); + + void AddAssignment(BufferAllocation* allocation, const BufferValue& value, int64 offset, int64 size); // Returns the HloModule used to construct this assignment. @@ -499,9 +516,17 @@ class BufferAssignment { const PointsToSet& GetPointsToSet(const HloInstruction* instruction) const; // Mutable accessors for allocations. 
- BufferAllocation* GetMutableAssignedAllocation(const LogicalBuffer& buffer); + BufferAllocation* GetMutableAssignedAllocation(const HloBuffer& buffer); BufferAllocation* GetMutableAllocation(BufferAllocation::Index index); + int64 HloBufferSize(const HloBuffer& buffer) { + int64 result = buffer_size_(*buffer.values()[0]); + for (const HloValue* value : buffer.values()) { + DCHECK_EQ(result, buffer_size_(*value)); + } + return result; + } + // Combines allocations of temporary buffers into one big BufferAllocation. void CombineTempAllocations(); @@ -515,18 +540,20 @@ class BufferAssignment { int64 temp_allocation_total_size_ = 0; // Maps Buffers to the index of the BufferAllocation which holds the buffer. - absl::flat_hash_map - allocation_index_for_buffer_; + absl::flat_hash_map + allocation_index_for_value_; const HloModule* module_; const std::unique_ptr liveness_; // Function which returns the buffer size for a given logical buffer (shape). - LogicalBuffer::SizeFunction buffer_size_; + BufferValue::SizeFunction buffer_size_; // Function which returns the alignment for a given logical buffer color. LogicalBuffer::AlignmentFunction color_alignment_; + std::unique_ptr alias_analysis_; + Stats stats_; TF_DISALLOW_COPY_AND_ASSIGN(BufferAssignment); @@ -535,61 +562,86 @@ class BufferAssignment { // A class which constructs a buffer assignment. class BufferAssigner { public: - // Returns false if a buffer cannot be assigned to given allocation. - using ReuseAllocationFunction = std::function; + using Colorer = std::function; - // Returns whether a logical buffer can be considered reusing memory for - // colocated buffers. - using ReuseColocatedAllocationForTempChecker = - std::function; + static Colorer DefaultColorer() { + return [](HloAliasAnalysis* alias_analysis, const HloOrdering&) { + for (HloValue* value : alias_analysis->dataflow_analysis().values()) { + value->set_color(BufferValue::Color(0)); + } + return Status::OK(); + }; + } + + // Returns false if a buffer cannot be assigned to given allocation. // Build and return a BufferAssignment for the given module. The given // HloOrdering is used to determine buffer liveness. buffer_size and // color_alignment are functions which returns the size and alignment of a - // LogicalBuffer. allow_input_output_aliasing specifies whether input buffer - // are allowed to be reused as outbut buffers by the client code. + // LogicalBuffer. 
static StatusOr> Run( const HloModule* module, std::unique_ptr hlo_ordering, - LogicalBuffer::SizeFunction buffer_size, + BufferValue::SizeFunction buffer_size, LogicalBuffer::AlignmentFunction color_alignment, - bool allow_input_output_aliasing = false, bool allocate_buffers_for_constants = false, - BufferLiveness::Colorer colorer = BufferLiveness::DefaultColorer(), - ReuseAllocationFunction reuse_checker = nullptr, - ReuseColocatedAllocationForTempChecker reuse_colocated_checker = nullptr); + Colorer colorer = DefaultColorer(), + const absl::flat_hash_set& must_not_live_out = {}, + HloDataflowAnalysis::FusionCanShareBufferFunction + fusion_can_share_buffer = nullptr); private: - BufferAssigner(bool allocate_buffers_for_constants, - BufferLiveness::Colorer colorer, - ReuseAllocationFunction reuse_checker, - ReuseColocatedAllocationForTempChecker reuse_colocated_checker) + BufferAssigner(bool allocate_buffers_for_constants, Colorer colorer, + const absl::flat_hash_set& must_not_live_out) : allocate_buffers_for_constants_(allocate_buffers_for_constants), - colorer_(std::move(colorer)), - reuse_checker_(std::move(reuse_checker)), - reuse_colocated_checker_(std::move(reuse_colocated_checker)) {} + colorer_(colorer), + must_not_live_out_(must_not_live_out) {} virtual ~BufferAssigner() = default; // Create a buffer assignment. StatusOr> CreateAssignment( const HloModule* module, std::unique_ptr hlo_ordering, - LogicalBuffer::SizeFunction buffer_size, - LogicalBuffer::AlignmentFunction color_alignment); + BufferValue::SizeFunction buffer_size, + LogicalBuffer::AlignmentFunction color_alignment, + HloDataflowAnalysis::FusionCanShareBufferFunction + fusion_can_share_buffer); - // Assigns buffers to the instructions in the given computation. "assignment" + // Assigns buffers to the instructions in the given computations. "assignment" // is modified to reflect the new buffer assignments. If is_thread_local is // true, then all assigned buffers have the is_thread_local flag set to // true. - Status AssignBuffersForComputation( - const HloComputation* computation, bool is_thread_local, - const absl::flat_hash_set& colocated_buffers, - const absl::flat_hash_set& colocated_allocations, + Status AssignBuffersForComputations( + const std::vector& computations, + bool is_thread_local, absl::flat_hash_map>* + absl::flat_hash_set>* buffers_to_assign_sequentially, BufferAssignment* assignment); + // Converts a HloValueSet to LogicalBufferSet, this is needed for buffer + // assignment, which uses dataflow analysis, to talk to heap simulator that + // still uses tuple-points-to analysis. + BufferValueFlatSet HloValueSetToLogicalBufferSet( + const absl::flat_hash_set& hlo_value_set, + const TuplePointsToAnalysis& points_to_analysis); + + // Creates sets of buffer values that must be aliased with each other (e.g., + // while init and loop body parameter). + std::vector BuildMustAliasLogicalBufferSet( + BufferAssignment* assignment); + + // Promotes operations (DUS, scatter) to be done in place: If an operation can + // be done in place, merge its buffer with its operand buffer. + Status MergeInplaceOpBuffers(BufferAssignment* assignment); + + // Assigns a single hlo buffer to an HLO allocation. 
+ Status AssignSingleHloBuffer( + const HloBuffer* hlo_buffer, bool is_thread_local, + absl::flat_hash_map>* + buffers_to_assign_sequentially, + std::vector* allocation_indices, + BufferAssignment* assignment); + // Assigns 'buffers_to_assign_sequentially' using heap simulation, assuming // the HLO instructions will be executed in the sequential order given by // assignment->liveness().hlo_ordering().SequentialOrder. If @@ -597,7 +649,7 @@ class BufferAssigner { // assuming all global computations are sequentially ordered. Status AssignBuffersWithSequentialOrdering( const absl::flat_hash_map>& + absl::flat_hash_set>& buffers_to_assign_sequentially, bool run_whole_module_heap_simulation, BufferAssignment* assignment); @@ -609,64 +661,24 @@ class BufferAssigner { // Tries to assign the given instruction to the given buffer. Returns if the // assignment was successful. - bool MaybeAssignBuffer(BufferAllocation* allocation, - const LogicalBuffer& buffer, + bool MaybeAssignBuffer(BufferAllocation* allocation, const HloBuffer& buffer, BufferAssignment* assignment); - // Colocated buffers are logical buffers from different computations which - // alias. Explicitly handling these colocated buffers is necessary because - // points-to analysis is computation level scope and does not recognize - // aliasing across computations (b/32491382). - using ColocatedBufferSet = absl::flat_hash_set; - - // Returns a vector of ColocatedBufferSet objects, where each - // ColocatedBufferSet aggregates a set of related LogicalBuffers from 'module' - // which should be colocated in the same buffer allocation. - void BuildColocatedBufferSets( - const HloModule* module, const BufferLiveness& buffer_liveness, - const LogicalBuffer::SizeFunction& buffer_size, - std::vector* colocated_buffer_sets); - - // For each buffer set in 'colocated_buffer_sets', assigns all buffers in the - // same set to the same buffer allocation in 'assignment'. - void AssignColocatedBufferSets( - const std::vector& colocated_buffer_sets, - BufferAssignment* assignment, - absl::flat_hash_set* colocated_buffers, - absl::flat_hash_set* colocated_allocations); - - // Adds the 'colocated_set' of buffers to 'colocated_buffer_sets', maintaining - // the invariant that all sets in 'colocated_buffer_sets' are disjoint. - void AddSetToColocatedBufferSets( - const std::vector& colocated_set, - std::vector* colocated_buffer_sets); - - // Given a list of colocated buffer sets (each colocated buffer set represents - // the logical buffers that would be assigned to the same physical buffer), - // try to merge the sets if the buffers can be shared. Returns the merged set. - std::vector MergeColocatedBufferSets( - const std::vector& colocated_buffer_sets, - const BufferLiveness& buffer_liveness, - const LogicalBuffer::SizeFunction& buffer_size); - // Split a set of buffers into several sets, each of which contains buffers // colored with the same color. absl::flat_hash_map, + absl::flat_hash_set, LogicalBuffer::Color::Hasher> - SplitBuffersByColor(const absl::flat_hash_set& buffers); + SplitBuffersByColor(const absl::flat_hash_set& buffers); // If true, allocate buffers for constant instructions. bool allocate_buffers_for_constants_; // Functor used to assign colors to newly allocated logical buffers. - BufferLiveness::Colorer colorer_; + Colorer colorer_; - // Functor to check if a buffer can reuse an allocation. - ReuseAllocationFunction reuse_checker_; - - // Functor to check if a temp buffer can reuse a colocated allocation. 
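With the reuse-checker callbacks removed, reuse restrictions are now expressed as a set of opcodes that must not live out of a computation. A condensed caller-side sketch of the new Run() overload; the ordering, size function, and opcode set here are illustrative, not taken from the patch:

StatusOr<std::unique_ptr<BufferAssignment>> RunAssignerExample(HloModule* module) {
  return BufferAssigner::Run(
      module, absl::make_unique<DependencyHloOrdering>(module),
      /*buffer_size=*/
      [](const BufferValue& value) {
        return ShapeUtil::ByteSizeOf(value.shape(), /*pointer_size=*/8);
      },
      /*color_alignment=*/[](LogicalBuffer::Color) { return 1; },
      /*allocate_buffers_for_constants=*/true,
      /*colorer=*/BufferAssigner::DefaultColorer(),
      /*must_not_live_out=*/{HloOpcode::kAdd});
}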
- ReuseColocatedAllocationForTempChecker reuse_colocated_checker_; + // A set of hlo opcodes that can't live out of a computation. + absl::flat_hash_set must_not_live_out_; TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner); }; diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index acdf5d25e1d..8837e6d9344 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -92,7 +92,6 @@ class BufferAssignmentTest : public HloTestBase { module, absl::make_unique(module), backend().compiler()->BufferSizeBytesFunction(), [alignment](LogicalBuffer::Color) { return alignment; }, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/true) .ConsumeValueOrDie(); } @@ -103,36 +102,30 @@ class BufferAssignmentTest : public HloTestBase { module, absl::make_unique(module), backend().compiler()->BufferSizeBytesFunction(), [alignment](LogicalBuffer::Color) { return alignment; }, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/false) .ConsumeValueOrDie(); } std::unique_ptr RunBufferAssignmentNoBuffersReuseForAdd( HloModule* module, int64 alignment = 1) { - auto reuse_checker = [](const BufferAssignment& assignment, - const BufferAllocation& alloc, - const LogicalBuffer& buffer) { - return (buffer.instruction()->opcode() != HloOpcode::kAdd); - }; + absl::flat_hash_set must_not_live_out = {HloOpcode::kAdd}; + return BufferAssigner::Run( module, absl::make_unique(module), backend().compiler()->BufferSizeBytesFunction(), [alignment](LogicalBuffer::Color) { return alignment; }, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/false, - /*colorer=*/BufferLiveness::DefaultColorer(), - /*reuse_checker=*/reuse_checker) + /*colorer=*/BufferAssigner::DefaultColorer(), + /*must_not_live_out=*/must_not_live_out) .ConsumeValueOrDie(); } std::unique_ptr RunColoredBufferAssignment( - HloModule* module, BufferLiveness::Colorer colorer, int64 alignment = 1) { + HloModule* module, BufferAssigner::Colorer colorer, int64 alignment = 1) { return BufferAssigner::Run( module, absl::make_unique(module), backend().compiler()->BufferSizeBytesFunction(), [alignment](LogicalBuffer::Color) { return alignment; }, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/true, std::move(colorer)) .ConsumeValueOrDie(); } @@ -146,29 +139,10 @@ class BufferAssignmentTest : public HloTestBase { module, absl::make_unique(schedule), backend().compiler()->BufferSizeBytesFunction(), [alignment](LogicalBuffer::Color) { return alignment; }, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/true) .ConsumeValueOrDie(); } - std::unique_ptr - RunBufferAssignmentWithReusingColocatedBuffersForTemp(HloModule* module, - int64 alignment = 1) { - return BufferAssigner::Run( - module, absl::make_unique(module), - backend().compiler()->BufferSizeBytesFunction(), - [alignment](LogicalBuffer::Color) { return alignment; }, - /*allow_input_output_aliasing=*/false, - /*allocate_buffers_for_constants=*/true, - /*colorer=*/BufferLiveness::DefaultColorer(), - /*reuse_checker=*/nullptr, - /*reuse_colocated_checker=*/ - [](const LogicalBuffer& buffer, int64 byte_size) { - return true; - }) - .ConsumeValueOrDie(); - } - // Builds an x+1.0 computation to use in a Map. 
std::unique_ptr BuildMapComputationPlus1(const string& name) { auto builder = HloComputation::Builder(name); @@ -518,77 +492,8 @@ TEST_F(BufferAssignmentTest, AliasedParamCanBeReused) { EXPECT_EQ(neg_2_buffer.index(), neg_1_buffer.index()); } -TEST_F(BufferAssignmentTest, ReuseColocatedBuffersForTemp) { - const char* const hlo_string = R"( -HloModule test - -sum (a: f32[], b: f32[]) -> f32[] { - a = f32[] parameter(0) - b = f32[] parameter(1) - ROOT add = f32[] add(a, b) -} - -while_body { - state = (s32[], f32[1280,1,128]{2,1,0}) parameter(0) - get-tuple-element.4 = f32[1280,1,128]{2,1,0} get-tuple-element(state), index=1 - get-tuple-element.3 = s32[] get-tuple-element(state), index=0 - constant.2 = s32[] constant(128) - add.5 = s32[] add(get-tuple-element.3, constant.2) - broadcast = f32[2,1280,1,128]{3,2,1,0} broadcast(get-tuple-element.4), dimensions={1,2,3} - constant.3 = s32[] constant(0) - reduce = f32[1280,1,128]{2,1,0} reduce(broadcast, constant.3), dimensions={3}, to_apply=sum - ROOT tuple.85 = (s32[], f32[1280,1,128]{2,1,0}) tuple(add.5, reduce) -} - -while_condition { - state = (s32[], f32[1280,1,128]{2,1,0}) parameter(0) - get-tuple-element = s32[] get-tuple-element(state), index=0 - get-tuple-element.1 = s32[] constant(3) - ROOT less-than.339.338 = pred[] compare(get-tuple-element, get-tuple-element.1), direction=LT -} - -sum.1 (a: f32[], b: f32[]) -> f32[] { - a = f32[] parameter(0) - b = f32[] parameter(1) - ROOT add = f32[] add(a, b) -} - -ENTRY entry_computation { - parameter = f32[2,1280,1,128]{3,2,1,0} parameter(0) - constant.6 = f32[] constant(0) - reduce.1 = f32[1280,1,128]{2,1,0} reduce(parameter, constant.6), dimensions={3}, to_apply=sum.1 - constant.7 = s32[] constant(0) - tuple.1 = (s32[], f32[1280,1,128]{2,1,0}) tuple(constant.7, reduce.1) - while.0 = (s32[], f32[1280,1,128]{2,1,0}) while(tuple.1), condition=while_condition, body=while_body - get-tuple-element.1 = f32[1280,1,128] get-tuple-element(while.0), index=1 - ROOT broadcast.1 = f32[2,1280,1,128]{3,2,1,0} broadcast(get-tuple-element.1), dimensions={1,2,3} -} - -)"; - auto module_or_status = - HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()); - auto module = module_or_status.ConsumeValueOrDie(); - - TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias( - {}, 0, {}, HloInputOutputAliasConfig::kUserAlias)); - - auto assignment = - RunBufferAssignmentWithReusingColocatedBuffersForTemp(module.get()); - // Get BufferAllocation for root instruction. - auto broadcast = FindInstruction(module.get(), "broadcast"); - auto broadcast_alloc_slice = - assignment->GetUniqueTopLevelSlice(broadcast).ConsumeValueOrDie(); - auto parameter = FindInstruction(module.get(), "parameter"); - auto parameter_alloc_slice = - assignment->GetUniqueTopLevelSlice(parameter).ConsumeValueOrDie(); - - EXPECT_EQ(broadcast_alloc_slice.allocation(), - parameter_alloc_slice.allocation()); - EXPECT_EQ(broadcast_alloc_slice, parameter_alloc_slice); -} - TEST_F(BufferAssignmentTest, AddCannotReuse) { - // Pass in a special rule to indicate that "add" cannot reuse any buffer. + // Pass in a special rule to indicate that "add" cannot be live out. // // paramscalar ------- (mul) -- (add) -- (sub) // / / / @@ -625,13 +530,13 @@ TEST_F(BufferAssignmentTest, AddCannotReuse) { EXPECT_NE(param0_buffer.index(), param1_buffer.index()); // The mul node has a valid buffer assigned, doesn't share with input. 
- const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); - EXPECT_NE(mul_buffer.index(), param0_buffer.index()); + const BufferAllocation& sub_buffer = GetTopLevelAllocation(*buffers, sub); + EXPECT_NE(sub_buffer.index(), param0_buffer.index()); // The add node cannot reuse the mul node's buffer since we told buffer // assignment so. const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add); - EXPECT_NE(add_buffer.index(), mul_buffer.index()); + EXPECT_NE(add_buffer.index(), sub_buffer.index()); // The sub node has a valid output buffer assigned. GetAssignedOutputAllocation(*buffers, sub); @@ -663,14 +568,12 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) { auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto colorer = [](const BufferLiveness& buffer_liveness) { + auto colorer = [](HloAliasAnalysis* alias_analysis, const HloOrdering&) { int color = 0; - - for (LogicalBuffer::Id id = 0; - id < buffer_liveness.points_to_analysis().num_logical_buffers(); - id++) { - auto& buffer = buffer_liveness.points_to_analysis().logical_buffer(id); - buffer.set_color(LogicalBuffer::Color(color++)); + for (HloValue::Id id = 0; + id < alias_analysis->dataflow_analysis().values().size(); id++) { + auto& value = alias_analysis->dataflow_analysis().GetValue(id); + value.set_color(BufferValue::Color(color++)); } return Status::OK(); }; @@ -724,21 +627,19 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) { auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - auto colorer = [](const BufferLiveness& buffer_liveness) { - for (LogicalBuffer::Id id = 0; - id < buffer_liveness.points_to_analysis().num_logical_buffers(); - id++) { - auto& buffer = buffer_liveness.points_to_analysis().logical_buffer(id); - const auto& aliases = - buffer_liveness.points_to_analysis().GetBufferAliases(buffer); - for (const auto& alias : aliases) { - if (alias.instruction()->opcode() == HloOpcode::kAdd || - alias.instruction()->opcode() == HloOpcode::kMultiply) { - buffer.set_color(LogicalBuffer::Color(1)); + auto colorer = [](HloAliasAnalysis* alias_analysis, const HloOrdering&) { + for (HloValue::Id id = 0; + id < alias_analysis->dataflow_analysis().values().size(); id++) { + auto& value = alias_analysis->dataflow_analysis().GetValue(id); + auto& buffer = alias_analysis->GetBufferContainingValue(value); + for (const auto& alias : buffer.values()) { + if (alias->instruction()->opcode() == HloOpcode::kAdd || + alias->instruction()->opcode() == HloOpcode::kMultiply) { + value.set_color(LogicalBuffer::Color(1)); } } - if (!buffer.has_color()) { - buffer.set_color(LogicalBuffer::Color(0)); + if (!value.has_color()) { + value.set_color(LogicalBuffer::Color(0)); } } return Status::OK(); @@ -1734,7 +1635,7 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) { } TEST_F(BufferAssignmentTest, TrivialPeakBuffers) { - // paramscalar ------- (mul) -- (add) -- (sub) + // paramscalar -(bc)- (mul) -- (add) -- (sub) // / / / // param0[100] -------/ / / // / / @@ -1752,7 +1653,7 @@ TEST_F(BufferAssignmentTest, TrivialPeakBuffers) { f32vec100_, HloOpcode::kMultiply, broadcast, param0)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1)); - builder.AddInstruction(HloInstruction::CreateBinary( + auto sub = builder.AddInstruction(HloInstruction::CreateBinary( f32vec100_, HloOpcode::kSubtract, add, param1)); auto module = CreateNewVerifiedModule(); 
module->AddEntryComputation(builder.Build()); @@ -1760,10 +1661,10 @@ TEST_F(BufferAssignmentTest, TrivialPeakBuffers) { auto buffers = RunBufferAssignment(module.get()); const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); - const std::vector& peak_buffers = + const std::vector& peak_buffers = mul_buffer.PeakMemoryLogicalBuffers(); ASSERT_EQ(peak_buffers.size(), 1); - EXPECT_EQ(peak_buffers[0]->instruction(), broadcast); + EXPECT_EQ(peak_buffers[0]->instruction(), sub); } TEST_F(BufferAssignmentTest, PeakBuffers) { @@ -1807,81 +1708,18 @@ TEST_F(BufferAssignmentTest, PeakBuffers) { EXPECT_TRUE(buffer.IsPreallocatedTempBuffer()); ASSERT_EQ(buffer.assigned_buffers().size(), 4); - const std::vector& peak_buffers = + const std::vector& peak_buffers = buffer.PeakMemoryLogicalBuffers(); // The peak live set should be concat and its inputs. ASSERT_EQ(peak_buffers.size(), 3); std::vector peak_instructions; - for (const LogicalBuffer* logical_buffer : peak_buffers) { + for (const BufferValue* logical_buffer : peak_buffers) { peak_instructions.push_back(logical_buffer->instruction()); } EXPECT_THAT(peak_instructions, UnorderedElementsAre(rev, neg, concat)); } -TEST_F(BufferAssignmentTest, PeakBuffersWhile) { - auto module = CreateNewVerifiedModule(); - const Shape shape = ShapeUtil::MakeShape(F32, {123, 123}); - HloComputation* condition; - { - auto b = HloComputation::Builder(TestName() + ".cond"); - b.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); - b.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(true))); - condition = module->AddEmbeddedComputation(b.Build()); - } - HloComputation* body; - { - auto b = HloComputation::Builder(TestName() + ".body"); - auto param = - b.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); - b.AddInstruction( - HloInstruction::CreateUnary(shape, HloOpcode::kNegate, param)); - body = module->AddEmbeddedComputation(b.Build()); - } - auto builder = HloComputation::Builder(TestName()); - auto param = - builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); - auto copy = builder.AddInstruction( - HloInstruction::CreateUnary(shape, HloOpcode::kCopy, param)); - auto while_op = builder.AddInstruction( - HloInstruction::CreateWhile(shape, condition, body, copy)); - // This broadcast should get a temporary allocation which is merged with the - // allocation for the while. Peak buffers should include the while and the - // broadcast. - auto bcast = builder.AddInstruction(HloInstruction::CreateBroadcast( - ShapeUtil::MakeShape(F32, {123, 123, 123}), while_op, {0, 1})); - builder.AddInstruction(HloInstruction::CreateReverse( - ShapeUtil::MakeShape(F32, {123, 123, 123}), bcast, {0})); - module->AddEntryComputation(builder.Build()); - - auto buffers = RunBufferAssignment(module.get()); - const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, bcast); - const std::vector& peak_buffers = - buffer.PeakMemoryLogicalBuffers(); - ASSERT_EQ(peak_buffers.size(), 2); - - // The peak buffers should include the broadcast and one of the colocated - // buffers of the while (body param, condition param, body root, or the while - // itself). 
- const LogicalBuffer* bcast_buffer; - const LogicalBuffer* nonbcast_buffer; - if (peak_buffers[0]->instruction() == bcast) { - bcast_buffer = peak_buffers[0]; - nonbcast_buffer = peak_buffers[1]; - } else { - bcast_buffer = peak_buffers[1]; - nonbcast_buffer = peak_buffers[0]; - } - EXPECT_EQ(bcast_buffer->instruction(), bcast); - EXPECT_TRUE( - nonbcast_buffer->instruction() == copy || - nonbcast_buffer->instruction() == while_op || - nonbcast_buffer->instruction() == body->parameter_instruction(0) || - nonbcast_buffer->instruction() == body->root_instruction() || - nonbcast_buffer->instruction() == condition->parameter_instruction(0)); -} - TEST_F(BufferAssignmentTest, ConstantBuffersAreNotReused) { const char* hlo_text = R"( HloModule Module @@ -1980,7 +1818,6 @@ class WhileBufferAssignmentTest : public HloTestBase { module, absl::make_unique(schedule), ByteSizeOf, [alignment](LogicalBuffer::Color) { return alignment; }, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/true) .ConsumeValueOrDie(); } @@ -2300,7 +2137,6 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { module.get(), absl::make_unique(schedule), backend().compiler()->BufferSizeBytesFunction(), [](LogicalBuffer::Color) { return 1; }, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/true)); // The result tuple elements must be assigned with different buffers. @@ -2533,7 +2369,6 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { BufferAssigner::Run( module.get(), absl::make_unique(schedule), ByteSizeOf, [](LogicalBuffer::Color) { return 1; }, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/true) .ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc index 301ac9cc3d4..f6dac508e5f 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc @@ -46,7 +46,7 @@ StatusOr TryRemoveConditional(HloInstruction* conditional) { CHECK_EQ(conditional->opcode(), HloOpcode::kConditional); // Do not remove conditionals that contain side-effecting instructions or // have control predecessors/successors in either true/false computation. - if (!conditional->parent()->IsRemovable(conditional) || + if (!conditional->parent()->IsSafelyRemovable(conditional) || conditional->HasSideEffect()) { VLOG(2) << "Not attempting to remove conditional as it is not removable or " "has side effect: " @@ -188,7 +188,7 @@ StatusOr ConditionalSimplifier::Run(HloModule* module) { // instructions as we iterate. 
std::vector conditional_ops; for (auto* comp : module->computations()) { - for (auto* instr : comp->instructions()) { + for (auto* instr : comp->MakeInstructionPostOrder()) { if (instr->opcode() == HloOpcode::kConditional) { conditional_ops.push_back(instr); } diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc index 9759526c6e0..a584aba816f 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc @@ -212,6 +212,65 @@ ENTRY main { .size(), 2); } + +TEST_F(ConditionalSimplifierTest, + TwoConditionalsCreatedInReversedLexicalOrder) { + absl::string_view hlo_string = R"( + HloModule DeadConditional + computation.1 { + param.1 = s64[] parameter(0) + constant.1 = s64[] constant(1) + ROOT add.1 = s64[] add(param.1, constant.1) + } + + computation.2 { + param.2 = s64[] parameter(0) + constant.2 = s64[] constant(2) + ROOT add.2 = s64[] add(param.2, constant.2) + } + + computation.3 { + param.3 = s64[] parameter(0) + constant.3 = s64[] constant(3) + ROOT add.3 = s64[] add(param.3, constant.3) + } + + computation.4 { + param.4 = s64[] parameter(0) + constant.4 = s64[] constant(4) + ROOT add.4 = s64[] add(param.4, constant.4) + } + + ENTRY KernelEntry { + param.1 = s64[] parameter(0) + param.2 = s64[] parameter(1) + param.3 = s64[] parameter(2) + param.4 = pred[] parameter(3) + + conditional_1 = s64[] conditional(param.4, param.3, param.2), + true_computation=computation.3, false_computation=computation.4 + constant.1 = pred[] constant(false) + ROOT conditional_2 = s64[] conditional(constant.1, conditional_1, + param.1), true_computation=computation.1, + false_computation=computation.2 + })"; + auto status = ParseHloString(hlo_string); + TF_ASSERT_OK(status.status()); + std::unique_ptr module = status.ConsumeValueOrDie(); + HloVerifier v(false, false); + TF_ASSERT_OK(v.Run(module.get()).status()); + + // Replace conditional_1 with a clone that is created after conditional_2. + HloInstruction* conditional_1 = + FindInstruction(module.get(), "conditional_1"); + HloInstruction* conditional_1_clone = + conditional_1->parent()->AddInstruction(conditional_1->Clone()); + TF_ASSERT_OK(conditional_1->ReplaceAllUsesWith(conditional_1_clone)); + TF_ASSERT_OK(conditional_1->parent()->RemoveInstruction(conditional_1)); + + EXPECT_TRUE(ConditionalSimplifier().Run(module.get()).ValueOrDie()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h index f7e19970feb..988b93b557f 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.h +++ b/tensorflow/compiler/xla/service/copy_insertion.h @@ -94,12 +94,12 @@ class CopyInsertion : public HloModulePass { virtual Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module); - private: - Status AddCopiesToResolveInterference(HloModule* module); - // Backend specific function that decides whether a fusion can share buffer // with its operand. 
HloDataflowAnalysis::FusionCanShareBufferFunction fusion_can_share_buffer_; + + private: + Status AddCopiesToResolveInterference(HloModule* module); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 09f5c859af4..227d8ffb1a0 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -9,10 +9,9 @@ load( load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") load(":build_defs.bzl", "runtime_copts") -licenses(["notice"]) # Apache 2.0 - package( default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 ) package_group( @@ -905,6 +904,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_cost_analysis", "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", ], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 06ea1e2f8bd..a3e224824ba 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -297,6 +297,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pass.AddInvariantChecker(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); + pass.AddPass(); pass.AddPass( /*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, @@ -340,8 +341,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass(); - pipeline.AddPass(); - ReducePrecisionInsertion::AddPasses( &pipeline, module->config().debug_options(), ReducePrecisionInsertion::PassTiming::AFTER_FUSION); @@ -658,7 +657,6 @@ StatusOr> CpuCompiler::RunBackend( BufferAssigner::Run(module.get(), absl::make_unique(schedule), BufferSizeBytesFunction(), memory_alignment, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/true)); DumpHloModuleIfEnabled(*module, *assignment, "after_optimizations"); @@ -851,7 +849,6 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, BufferAssigner::Run(module, absl::make_unique(schedule), BufferSizeBytesFunction(), memory_alignment, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/true)); // BufferAssignment::ToString() includes a header, so no need for us to // print one ourselves. diff --git a/tensorflow/compiler/xla/service/cpu/disassembler.cc b/tensorflow/compiler/xla/service/cpu/disassembler.cc index c3c6847b7b7..e95f29fc889 100644 --- a/tensorflow/compiler/xla/service/cpu/disassembler.cc +++ b/tensorflow/compiler/xla/service/cpu/disassembler.cc @@ -89,13 +89,14 @@ StatusOr Disassembler::DisassembleObjectFile( }); // Construct ArrayRef pointing to section contents. - llvm::StringRef section_content_string; - if (section.getContents(section_content_string)) { + llvm::Expected section_content_string = + section.getContents(); + if (!section_content_string) { continue; } llvm::ArrayRef section_content_bytes( - reinterpret_cast(section_content_string.data()), - section_content_string.size()); + reinterpret_cast(section_content_string->data()), + section_content_string->size()); // Use int types from LLVM (eg, uint64_t) for values passed to and returned // from the LLVM API. 
These values map to different types in LLVM and diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc index 234fa91fe3e..23312e40f7e 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h" namespace xla { namespace cpu { @@ -74,8 +75,9 @@ class DefaultCostModel : public ParallelCostModel { // Limit max parallelism for I/O bound instructions by assuming a // sub-linear scaling function (fit based on empirical benchmark results). // TODO(b/29630486) Develop system bandwidth model. - max_parallelism = - std::ceil(std::sqrt(tensorflow::port::NumSchedulableCPUs())); + max_parallelism = std::min( + max_parallelism_, + std::ceil(std::sqrt(tensorflow::port::NumSchedulableCPUs()))); // Use shape size instruction cost and L2 cache size min per-thread cost. instruction_cost = shape_size_(instruction->shape()); min_cost_per_thread = 256LL << 10; // 256KB L2 Cache size. @@ -134,6 +136,10 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount( // *) Emit custom loops (kSelectAndScatter). // *) Operations that are not thread safe (like infeed and rng). // *) Tuple-shaped. + // *) Operations that might be implemented as an in-place + // dynamic-update-slice, because we can't know how many output elements + // they will write (out-of-place will touch the whole output buffer, while + // in-place will only touch the updated elements). // TODO(b/27458679) Parallelize instructions which are skipped here. auto opcode = instruction->opcode(); if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant || @@ -147,6 +153,7 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount( PotentiallyImplementedAsEigenConvolution(*instruction, target_machine_features_)) || (opcode == HloOpcode::kFusion && !instruction->IsLoopFusion()) || + llvm_ir::MayBeImplementedAsInPlaceDynamicUpdateSlice(instruction) || instruction->shape().IsTuple()) { return 1; } diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc index 35ae62b42df..e2c93568b74 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc @@ -125,5 +125,50 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) { EXPECT_FALSE(changed); } +TEST_F(ParallelTaskAssignmentTest, InPlaceDynamicUpdateSliceNotParallelized) { + // A dynamic-update-slice within a while loop. This construction is an easy + // way to make a DUS which can be run "in-place" (i.e. the input and output + // are the same buffer, and running the DUS only writes to the updated + // elements). 
+ const string hlo_string = R"( + HloModule test + + body { + zero = s32[] constant(0) + one = s32[] constant(1) + ten = s32[] constant(10) + loop_carry = (s32[], u32[1,100], u32[10000,100]) parameter(0) + i = s32[] get-tuple-element(loop_carry), index=0 + i_plus_ten = s32[] add(i, ten) + update = u32[1,100] get-tuple-element(loop_carry), index=1 + data = u32[10000,100] get-tuple-element(loop_carry), index=2 + new_data = u32[10000,100] dynamic-update-slice(data, update, i_plus_ten, zero) + new_i = s32[] add(i, one) + ROOT tuple = (s32[], u32[1,100], u32[10000,100]) tuple(new_i, update, new_data) + } + + cond { + loop_carry = (s32[], u32[1,100], u32[10000,100]) parameter(0) + two = s32[] constant(2) + i = s32[] get-tuple-element(loop_carry), index=0 + ROOT less-than = pred[] compare(i, two), direction=LT + } + + ENTRY test { + zero = s32[] constant(0) + initial_i = s32[] parameter(0) + update = u32[1,100] parameter(1) + data = u32[10000,100] parameter(2) + tuple = (s32[], u32[1,100], u32[10000,100]) tuple(initial_i, update, data) + ROOT while = (s32[], u32[1,100], u32[10000,100]) while(tuple), condition=cond, body=body + } + )"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get())); + EXPECT_FALSE(changed); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index 382dfd0d99d..1fa2c56abd0 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -1,10 +1,9 @@ # Description: # Tests for LLVM-based CPU backend for XLA. -licenses(["notice"]) # Apache 2.0 - package( default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 ) package_group( diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc index b2563f9949e..1a31f5471de 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc @@ -222,54 +222,84 @@ Status DynamicDimensionInferenceVisitor::HandleReduce(HloInstruction* hlo) { } Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) { - return ForEachOperandDynamicDimension( - hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension, - int64 operand_index, HloInstruction* dynamic_size) { - HloInstruction* dot = hlo; - const DotDimensionNumbers& dimension_numbers = - dot->dot_dimension_numbers(); - // A map from the operand dimensions to result dimension. - absl::flat_hash_map result_dim_mapping; - int64 current_result_dims = 0; - std::unordered_set batch_dims( - dimension_numbers.rhs_batch_dimensions().begin(), - dimension_numbers.rhs_batch_dimensions().end()); + return ForEachOperandDynamicDimension(hlo, [&](HloInstruction* operand, + ShapeIndex operand_shape_index, + int64 operand_dimension, + int64 operand_index, + HloInstruction* dynamic_size) { + // There are three types of dimensions in a dot: + // A. batch dims + // B. contracting dims + // C. non-batch non-contracting dims. + // The output dimemsions of a dot has three parts with the following order: + // [(type A), (lhs type C), (rhs type C)] + // + // Note that both lhs and rhs have the same dimension sizes for batch, + // but the dimension index could be different. 
+    //
+    // Given one dynamic input dimension, either lhs or rhs, we use a
+    // mapping to find the corresponding output dimension.
+    HloInstruction* dot = hlo;
+    const DotDimensionNumbers& dimension_numbers = dot->dot_dimension_numbers();
+    // A map from the operand dimensions to result dimension.
+    absl::flat_hash_map result_dim_mapping;
+    int64 current_result_dims = 0;
-        for (int64 i : dimension_numbers.rhs_batch_dimensions()) {
-          result_dim_mapping[i] = current_result_dims++;
-        }
+    bool lhs = operand_index == 0;
-        for (int64 i = 0; i < dot->operand(0)->shape().rank(); i++) {
-          if (!absl::c_linear_search(
-                  dimension_numbers.lhs_contracting_dimensions(), i)) {
-            if (operand_index == 0) {
-              result_dim_mapping[i] = current_result_dims;
-            }
-            current_result_dims++;
-          }
-        }
+    // The first loop keeps track of the batch dimensions. RHS and LHS could
+    // have different batch dimension numbers.
+    if (lhs) {
+      for (int64 i : dimension_numbers.lhs_batch_dimensions()) {
+        result_dim_mapping[i] = current_result_dims++;
+      }
+    } else {
+      for (int64 i : dimension_numbers.rhs_batch_dimensions()) {
+        result_dim_mapping[i] = current_result_dims++;
+      }
+    }
-        for (int64 i = 0; i < dot->operand(1)->shape().rank(); i++) {
-          if (!absl::c_linear_search(
-                  dimension_numbers.rhs_contracting_dimensions(), i) &&
-              !absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(),
-                                     i)) {
-            if (operand_index == 1) {
-              result_dim_mapping[i] = current_result_dims;
-            }
-            current_result_dims++;
-          }
-        }
+    // Handle dimensions in the lhs.
+    for (int64 i = 0; i < dot->operand(0)->shape().rank(); i++) {
+      // Look for non-contracting and non-batching dimension.
+      if (absl::c_linear_search(dimension_numbers.lhs_contracting_dimensions(),
+                                i)) {
+        continue;
+      }
+      if (absl::c_linear_search(dimension_numbers.lhs_batch_dimensions(), i)) {
+        continue;
+      }
+      if (lhs) {
+        result_dim_mapping[i] = current_result_dims;
+      }
+      current_result_dims++;
+    }
-        // Check if the operand dim is in the result shape. If so, add another
-        // work item to trace that dimension.
-        auto iter = result_dim_mapping.find(dimension);
-        if (iter != result_dim_mapping.end()) {
-          parent_->SetDynamicSize(dot, {}, iter->second, dynamic_size);
-        }
+    // Handle dimensions in the rhs.
+    for (int64 i = 0; i < dot->operand(1)->shape().rank(); i++) {
+      // Look for non-contracting and non-batching dimension.
+      if (absl::c_linear_search(dimension_numbers.rhs_contracting_dimensions(),
+                                i)) {
+        continue;
+      }
+      if (absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(), i)) {
+        continue;
+      }
+      if (!lhs) {
+        result_dim_mapping[i] = current_result_dims;
+      }
+      current_result_dims++;
+    }
-        return Status::OK();
-      });
+    // Check if the operand dim is in the result shape. If so, add another
+    // work item to trace that dimension.
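// Worked example (illustration only; it mirrors the DotTestBatch case added
// to dynamic_dimension_inference_test.cc below): lhs = f32[4,128,2,8],
// rhs = f32[4,128,2,8], batch dims {0,2} on both operands, contracting dim
// {3}, output = f32[4,2,128,128]. For the lhs operand the loops above build
// result_dim_mapping = {0->0, 2->1, 1->2}, so a dynamic size on lhs
// dimension 0 is forwarded to output dimension 0 by the lookup below.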
+ auto iter = result_dim_mapping.find(operand_dimension); + if (iter != result_dim_mapping.end()) { + parent_->SetDynamicSize(dot, {}, iter->second, dynamic_size); + } + + return Status::OK(); + }); } Status DynamicDimensionInferenceVisitor::HandleTranspose(HloInstruction* hlo) { diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc index a18c0176153..335aff662ec 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc @@ -344,6 +344,45 @@ TEST_F(DynamicDimensionInferenceTest, DotTest) { EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 1), nullptr); } +TEST_F(DynamicDimensionInferenceTest, DotTestBatch) { + auto builder = HloComputation::Builder(TestName()); + auto lhs_shape = ShapeUtil::MakeShape(F32, {4, 128, 2, 8}); + auto rhs_shape = ShapeUtil::MakeShape(F32, {4, 128, 2, 8}); + auto output_shape = ShapeUtil::MakeShape(F32, {4, 2, 128, 128}); + + auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, lhs_shape, "A")); + auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, rhs_shape, "B")); + auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/2, scalar_shape_, "size_param")); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(3); + dot_dnums.add_rhs_contracting_dimensions(3); + dot_dnums.add_lhs_batch_dimensions(0); + dot_dnums.add_lhs_batch_dimensions(2); + dot_dnums.add_rhs_batch_dimensions(0); + dot_dnums.add_rhs_batch_dimensions(2); + auto dot = builder.AddInstruction( + HloInstruction::CreateDot(output_shape, a_param, b_param, dot_dnums, + HloTestBase::DefaultPrecisionConfig(2))); + + module_->AddEntryComputation(builder.Build()); + + // Set up dynamic parameter binding for batch dimension. + TF_CHECK_OK(module_->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{2, {}}, + DynamicParameterBinding::DynamicDimension{0, {}, 0})); + + SCOPED_TRACE(module_->ToString()); + TF_ASSERT_OK(RunInference()); + EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 0), size_param); + EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 1), nullptr); + EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 2), nullptr); + EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 3), nullptr); +} + TEST_F(DynamicDimensionInferenceTest, ConvolutionTest) { auto builder = HloComputation::Builder(TestName()); constexpr int xdim = 3; diff --git a/tensorflow/compiler/xla/service/dynamic_update_slice_test.cc b/tensorflow/compiler/xla/service/dynamic_update_slice_test.cc new file mode 100644 index 00000000000..a7caab685bf --- /dev/null +++ b/tensorflow/compiler/xla/service/dynamic_update_slice_test.cc @@ -0,0 +1,197 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +namespace { + +class DynamicUpdateSliceTest : public HloTestBase {}; + +XLA_TEST_F(DynamicUpdateSliceTest, ShardedInPlaceDUS) { + // A dynamic-update-slice within a while loop. This construction is an easy + // way to make a DUS which can be run "in-place" (i.e. the input and output + // are the same buffer, and running the DUS only writes to the updated + // elements). + const char kModuleStr[] = R"( + HloModule test + + body { + zero = s32[] constant(0) + one = s32[] constant(1) + ten = s32[] constant(10) + loop_carry = (s32[], u32[1,100], u32[10000,100]) parameter(0) + i = s32[] get-tuple-element(loop_carry), index=0 + i_plus_ten = s32[] add(i, ten) + update = u32[1,100] get-tuple-element(loop_carry), index=1 + data = u32[10000,100] get-tuple-element(loop_carry), index=2 + new_data = u32[10000,100] dynamic-update-slice(data, update, i_plus_ten, zero) + new_i = s32[] add(i, one) + ROOT tuple = (s32[], u32[1,100], u32[10000,100]) tuple(new_i, update, new_data) + } + + cond { + loop_carry = (s32[], u32[1,100], u32[10000,100]) parameter(0) + two = s32[] constant(2) + i = s32[] get-tuple-element(loop_carry), index=0 + ROOT less-than = pred[] compare(i, two), direction=LT + } + + ENTRY test { + zero = s32[] constant(0) + initial_i = s32[] parameter(0) + update = u32[1,100] parameter(1) + data = u32[10000,100] parameter(2) + tuple = (s32[], u32[1,100], u32[10000,100]) tuple(initial_i, update, data) + ROOT while = (s32[], u32[1,100], u32[10000,100]) while(tuple), condition=cond, body=body + } +)"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(kModuleStr)); + TF_ASSERT_OK_AND_ASSIGN(auto fake_arguments, MakeFakeArguments(module.get())); + fake_arguments[0] = LiteralUtil::CreateR0(0); + + std::vector fake_argument_ptrs; + absl::c_transform( + fake_arguments, std::back_inserter(fake_argument_ptrs), + [](const Literal& literal) { return &const_cast(literal); }); + + ErrorSpec no_error(0, 0); + EXPECT_TRUE(RunAndCompare(std::move(module), fake_argument_ptrs, no_error)); +} + +// Regression test for a dynamic-update-slice involved in the expansion of a +// kScatter op. Apologies for the large testcase, this proved difficult to +// reduce. The bug we're checking for occurs when the dynamic-update-slice is +// run in place but is sharded across cores by ParallelTaskAssigner. 
+XLA_TEST_F(DynamicUpdateSliceTest, ExpandedScatter) { + const char kModuleStr[] = R"( +HloModule TensorFlowScatter + +and.reduce_sub_computation { + lhs = pred[] parameter(0) + rhs = pred[] parameter(1) + ROOT and = pred[] and(lhs, rhs) +} + +while_body { + param.1 = (s32[], f32[8,3,96,1,64]{4,3,2,1,0}, s32[16,4]{1,0}, f32[16,64]{1,0}) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(param.1), index=0 + constant.4 = s32[] constant(1) + add = s32[] add(get-tuple-element.1, constant.4) + get-tuple-element.2 = f32[8,3,96,1,64]{4,3,2,1,0} get-tuple-element(param.1), index=1 + constant.8 = s32[] constant(0) + broadcast.1 = s32[5]{0} broadcast(constant.8), dimensions={} + get-tuple-element.3 = s32[16,4]{1,0} get-tuple-element(param.1), index=2 + constant.5 = s32[] constant(0) + dynamic-slice = s32[1,4]{1,0} dynamic-slice(get-tuple-element.3, get-tuple-element.1, constant.5), dynamic_slice_sizes={1,4} + slice.18 = s32[1,1]{1,0} slice(dynamic-slice), slice={[0:1], [0:1]} + reshape.23 = s32[1]{0} reshape(slice.18) + reshape.4 = s32[4]{0} reshape(dynamic-slice) + slice.19 = s32[3]{0} slice(reshape.4), slice={[1:4]} + constant.6 = s32[1]{0} constant({0}) + concatenate.1 = s32[5]{0} concatenate(reshape.23, slice.19, constant.6), dimensions={0} + compare.1 = pred[5]{0} compare(broadcast.1, concatenate.1), direction=LE + constant.9 = s32[5]{0} constant({7, 2, 95, 0, 0}) + compare.2 = pred[5]{0} compare(constant.9, concatenate.1), direction=GE + and.1 = pred[5]{0} and(compare.1, compare.2) + constant.10 = pred[] constant(true) + reduce = pred[] reduce(and.1, constant.10), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2 = pred[1,1,1,1,64]{4,3,2,1,0} broadcast(reduce), dimensions={} + reshape.24 = s32[] reshape(slice.18) + slice.26 = s32[1]{0} slice(reshape.4), slice={[1:2]} + reshape.10 = s32[] reshape(slice.26) + slice.27 = s32[1]{0} slice(reshape.4), slice={[2:3]} + reshape.11 = s32[] reshape(slice.27) + slice.28 = s32[1]{0} slice(reshape.4), slice={[3:4]} + reshape.12 = s32[] reshape(slice.28) + reshape.13 = s32[] reshape(constant.6) + dynamic-slice.2 = f32[1,1,1,1,64]{4,3,2,1,0} dynamic-slice(get-tuple-element.2, reshape.24, reshape.10, reshape.11, reshape.12, reshape.13), dynamic_slice_sizes={1,1,1,1,64} + get-tuple-element.4 = f32[16,64]{1,0} get-tuple-element(param.1), index=3 + constant.7 = s32[] constant(0) + dynamic-slice.1 = f32[1,64]{1,0} dynamic-slice(get-tuple-element.4, get-tuple-element.1, constant.7), dynamic_slice_sizes={1,64} + reshape.28 = f32[1,1,1,1,64]{4,3,2,1,0} reshape(dynamic-slice.1) + add.1 = f32[1,1,1,1,64]{4,3,2,1,0} add(dynamic-slice.2, reshape.28) + select = f32[1,1,1,1,64]{4,3,2,1,0} select(broadcast.2, add.1, dynamic-slice.2) + reshape.29 = s32[] reshape(slice.18) + slice.29 = s32[1]{0} slice(reshape.4), slice={[1:2]} + reshape.15 = s32[] reshape(slice.29) + slice.30 = s32[1]{0} slice(reshape.4), slice={[2:3]} + reshape.16 = s32[] reshape(slice.30) + slice.31 = s32[1]{0} slice(reshape.4), slice={[3:4]} + reshape.17 = s32[] reshape(slice.31) + reshape.18 = s32[] reshape(constant.6) + dynamic-update-slice = f32[8,3,96,1,64]{4,3,2,1,0} dynamic-update-slice(get-tuple-element.2, select, reshape.29, reshape.15, reshape.16, reshape.17, reshape.18) + ROOT tuple.1 = (s32[], f32[8,3,96,1,64]{4,3,2,1,0}, s32[16,4]{1,0}, f32[16,64]{1,0}) tuple(add, dynamic-update-slice, get-tuple-element.3, get-tuple-element.4) +} + +while_cond { + param.0 = (s32[], f32[8,3,96,1,64]{4,3,2,1,0}, s32[16,4]{1,0}, f32[16,64]{1,0}) parameter(0) + get-tuple-element = s32[] 
get-tuple-element(param.0), index=0 + constant.2 = s32[] constant(16) + ROOT compare = pred[] compare(get-tuple-element, constant.2), direction=LT +} + +ENTRY main { + constant = s32[] constant(0) + z = f32[] constant(0) + b = f32[8,3,96,1,64]{4,3,2,1,0} broadcast(z), dimensions={} + i = s32[8,2,4]{2,1,0} parameter(0) + reshape = s32[16,4]{1,0} reshape(i) + u = f32[8,2,64]{2,1,0} parameter(1) + reshape.1 = f32[16,64]{1,0} reshape(u) + tuple = (s32[], f32[8,3,96,1,64]{4,3,2,1,0}, s32[16,4]{1,0}, f32[16,64]{1,0}) tuple(constant, b, reshape, reshape.1) + while = (s32[], f32[8,3,96,1,64]{4,3,2,1,0}, s32[16,4]{1,0}, f32[16,64]{1,0}) while(tuple), condition=while_cond, body=while_body + ROOT get-tuple-element.5 = f32[8,3,96,1,64]{4,3,2,1,0} get-tuple-element(while), index=1 +} +)"; + + Literal updates = + Literal::CreateFromShape(ShapeUtil::MakeShape(F32, {8, 2, 64})); + updates.PopulateWithValue(1.0f); + + Literal indices = + Literal::CreateFromShape(ShapeUtil::MakeShape(S32, {8, 2, 4})); + indices + .Populate([&](absl::Span indices) -> int { + auto i = indices[2] + indices[1] * 4 + indices[0] * 2 * 4; + switch (indices[2]) { + case 0: + return i % 8; + case 1: + return i % 3; + case 2: + return i % 96; + default: + return 0; + } + }) + .IgnoreError(); + + ErrorSpec no_error(0, 0); + EXPECT_TRUE( + RunAndCompare(ParseAndReturnVerifiedModule(kModuleStr).ValueOrDie(), + {&indices, &updates}, no_error)); +} + +} // anonymous namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index d6a7ec90b59..efa44b2a88d 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -45,14 +45,18 @@ Status GenericTransferManager::WriteSingleTupleIndexTable( const Shape& shape, se::DeviceMemoryBase* region) { TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape)); - std::vector element_pointers; + auto element_pointers = std::make_shared>(); + element_pointers->reserve(elements.size()); for (const se::DeviceMemoryBase& element : elements) { - element_pointers.push_back(element.opaque()); + element_pointers->push_back(element.opaque()); } TF_RETURN_IF_ERROR(TransferBufferToDevice( - stream, GetByteSizeRequirement(shape), element_pointers.data(), region)); + stream, GetByteSizeRequirement(shape), element_pointers->data(), region)); // Ensure the buffer is transferred before we destroy element_pointers. 
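The change below replaces the blocking host sync with a host callback that simply owns the staging vector, so the transfer stays asynchronous while the host memory outlives it. A generic sketch of the same keep-alive pattern, with illustrative names (not from this change):

// Enqueue an async host-to-device copy; the stream callback holds the last
// shared_ptr reference, so the host staging buffer is released only after
// the stream has consumed it.
void EnqueueCopyKeepAlive(se::Stream* stream, se::DeviceMemoryBase* dst,
                          std::shared_ptr<std::vector<uint8>> staging) {
  stream->ThenMemcpy(dst, staging->data(), staging->size());
  stream->ThenDoHostCallback([staging] { /* keeps 'staging' alive */ });
}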
- return stream->BlockHostUntilDone(); + stream->ThenDoHostCallback([element_pointers]() { + /* holds reference to element_pointers in closure */ + }); + return Status::OK(); } void GenericTransferManager::TransferLiteralFromDevice( @@ -115,7 +119,7 @@ Status GenericTransferManager::TransferLiteralToDeviceAsync( TF_RET_CHECK(stream->parent()->device_ordinal() == device_buffer.device_ordinal()); - TF_RETURN_IF_ERROR(WriteTupleIndexTables(stream, device_buffer)); + TF_RETURN_IF_ERROR(WriteTupleIndexTablesAsync(stream, device_buffer)); return ShapeUtil::ForEachSubshapeWithStatus( device_buffer.on_host_shape(), diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 2ffc6c8fb63..7a4c5ffc742 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -10,9 +10,10 @@ load( load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = [":friends"]) +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) package_group( name = "friends", @@ -333,6 +334,7 @@ cc_library( deps = [ ":buffer_allocations", ":hlo_execution_profiler", + "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", @@ -347,6 +349,7 @@ tf_cuda_library( ":buffer_allocations", ":hlo_execution_profiler", ":thunk", + "//tensorflow/compiler/xla:refcounting_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/synchronization", "//tensorflow/compiler/xla:util", @@ -451,6 +454,7 @@ cc_library( "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:kernel", "//tensorflow/stream_executor/cuda:cuda_stream", + "//tensorflow/stream_executor/cuda:curand_plugin", "//tensorflow/stream_executor/gpu:gpu_stream", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", @@ -547,6 +551,7 @@ cc_library( "//tensorflow/stream_executor:device_memory", "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:stream_executor_headers", + "//tensorflow/stream_executor/cuda:ptxas_utils", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", ], @@ -635,6 +640,9 @@ cc_library( srcs = ["cusolver_context.cc"], hdrs = ["cusolver_context.h"], deps = [ + # LINT.IfChange + "@local_config_cuda//cuda:cublas_headers", + # LINT.ThenChange(//tensorflow/copy.bara.sky:cublas_headers) "@local_config_cuda//cuda:cuda_headers", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", @@ -978,6 +986,7 @@ cc_library( "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:stream_executor_headers", "//tensorflow/stream_executor/cuda:cuda_diagnostics", + "//tensorflow/stream_executor/cuda:ptxas_utils", "@com_google_absl//absl/container:node_hash_map", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1158,6 +1167,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:kernel_spec", + "//tensorflow/stream_executor/cuda:ptxas_utils", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc 
b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc index 5f3b3b48ef2..31e3eadd69f 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc @@ -363,10 +363,10 @@ static StatusOr DeviceCompare(se::Stream* stream, se::DeviceMemory rhs_typed(rhs); uint64 buffer_size = lhs_typed.ElementCount(); - PtxCompilationOptions opts(config); - TF_ASSIGN_OR_RETURN( - absl::Span compiled_ptx, - CompilePtxOrGetCached(executor, buffer_compare_ptx, opts)); + TF_ASSIGN_OR_RETURN(absl::Span compiled_ptx, + se::cuda::CompilePtxOrGetCached( + executor->device_ordinal(), buffer_compare_ptx, + PtxOptsFromConfig(config))); TF_ASSIGN_OR_RETURN( std::unique_ptr> comparison_kernel, diff --git a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc index 7daef16cb62..84970a71ac3 100644 --- a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc @@ -52,7 +52,7 @@ CholeskyThunk::CholeskyThunk(const CholeskyOptions& options, Status CholeskyThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { VLOG(3) << "type=" << PrimitiveType_Name(type_) << " uplo=" << se::blas::UpperLowerString(uplo_) << " batch_size=" << batch_size_ << " n=" << n_ diff --git a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h index cde245a7e8b..eb6f02baa8c 100644 --- a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h @@ -52,7 +52,7 @@ class CholeskyThunk : public Thunk { CholeskyThunk& operator=(const CholeskyThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index ea639249826..90f797e7e15 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -58,7 +58,7 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable, Status ConditionalThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& run_id, HloExecutionProfiler* profiler) { auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); // Copy the predicate value from device. int32 branch_index = -1; @@ -89,7 +89,7 @@ Status ConditionalThunk::ExecuteOnStream( // Execute the branch computation corresponding to the value of branch_index. 
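Here and throughout the thunks below, a RunId is threaded through ExecuteOnStream so that composite thunks can forward it to their children, exactly as the conditional's branch dispatch does in the next hunk; leaf thunks that have no use for it simply ignore the new argument. A stripped-down sketch of that signature change (these types are simplified stand-ins, not the real XLA classes):

    #include <cstdint>

    struct BufferAllocations {};
    struct Stream {};
    struct RunId { int64_t value; };

    class Thunk {
     public:
      virtual ~Thunk() = default;
      // Every override now receives the RunId of the enclosing execution, even
      // if it ignores it.
      virtual void ExecuteOnStream(const BufferAllocations& allocations,
                                   Stream* stream, const RunId& run_id) = 0;
    };

    class ForwardingThunk : public Thunk {
     public:
      explicit ForwardingThunk(Thunk* body) : body_(body) {}
      void ExecuteOnStream(const BufferAllocations& allocations, Stream* stream,
                           const RunId& run_id) override {
        // Composite thunks (conditional/for/while) pass run_id through to
        // their nested thunk sequences unchanged.
        body_->ExecuteOnStream(allocations, stream, run_id);
      }

     private:
      Thunk* body_;
    };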
profiler->StartHloComputation(); TF_RETURN_IF_ERROR(branch_thunks_[branch_index]->ExecuteOnStream( - buffer_allocations, stream, profiler)); + buffer_allocations, stream, run_id, profiler)); profiler->FinishHloComputation( hlo_instruction()->branch_computation(branch_index)); diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index c0093ca6397..ca625f4a97b 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -54,7 +54,7 @@ class ConditionalThunk : public Thunk { Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index e1dffad3045..265a3f67020 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -43,7 +43,7 @@ ConvolutionThunk::ConvolutionThunk( Status ConvolutionThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { std::vector operand_se_buffers; for (const auto& buffer : operand_buffers_) { operand_se_buffers.push_back(buffer_allocations.GetDeviceAddress(buffer)); diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index c71515490c9..4a29164cbe6 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -54,7 +54,7 @@ class ConvolutionThunk : public Thunk { // Does the convolution for the thunk on "stream". 
Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc index 92e03f94c11..62878cf864d 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc @@ -32,7 +32,7 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk( Status HostToDeviceCopyThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); @@ -51,7 +51,7 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk( Status DeviceToDeviceCopyThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); se::DeviceMemoryBase source_data = diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h index 91564b520ac..30fb71f4c4e 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h @@ -41,7 +41,7 @@ class HostToDeviceCopyThunk : public Thunk { HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: @@ -65,7 +65,7 @@ class DeviceToDeviceCopyThunk : public Thunk { DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc index bc3c6f72f67..3147bc66e3f 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc @@ -100,7 +100,7 @@ CudnnBatchNormForwardInferenceThunk::CudnnBatchNormForwardInferenceThunk( Status CudnnBatchNormForwardInferenceThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { dnn::BatchDescriptor operand_desc; dnn::BatchDescriptor scale_offset_desc; std::tie(operand_desc, scale_offset_desc) = @@ -114,17 +114,19 @@ Status CudnnBatchNormForwardInferenceThunk::ExecuteOnStream( se::DeviceMemory(buffer_allocations.GetDeviceAddress(offset_)), se::DeviceMemory(buffer_allocations.GetDeviceAddress(mean_)), se::DeviceMemory(buffer_allocations.GetDeviceAddress(variance_)), - operand_desc, // - scale_offset_desc, // - epsilon_, // - &output, // - /*batch_mean=*/nullptr, // - /*batch_var=*/nullptr, // - /*saved_mean=*/nullptr, // - /*saved_inv_var=*/nullptr, // - /*is_training=*/false, // - /*var_to_inv_var=*/nullptr, // - /*inv_var_to_var=*/nullptr); + operand_desc, // + 
scale_offset_desc, // + epsilon_, // + &output, // + /*batch_mean=*/nullptr, // + /*batch_var=*/nullptr, // + /*saved_mean=*/nullptr, // + /*saved_inv_var=*/nullptr, // + /*is_training=*/false, // + /*var_to_inv_var=*/nullptr, // + /*inv_var_to_var=*/nullptr, // + /*reserve_space_allocator=*/nullptr, // + /*workspace_allocator=*/nullptr); if (!stream->ok()) { return InternalError("BatchNormalizationForward call failed."); @@ -162,7 +164,7 @@ CudnnBatchNormForwardTrainingThunk::CudnnBatchNormForwardTrainingThunk( Status CudnnBatchNormForwardTrainingThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { dnn::BatchDescriptor operand_desc; dnn::BatchDescriptor scale_offset_desc; // The BatchNormTraining HLO outputs a tuple of three elements: output data, @@ -196,7 +198,9 @@ Status CudnnBatchNormForwardTrainingThunk::ExecuteOnStream( /*saved_inv_var=*/&output_inv_stddev, // /*is_training=*/true, // /*var_to_inv_var=*/nullptr, // - /*inv_var_to_var=*/nullptr); + /*inv_var_to_var=*/nullptr, // + /*reserve_space_allocator=*/nullptr, // + /*workspace_allocator=*/nullptr); // Write the tuple. void* ptrs[] = {output_data.opaque(), output_mean.opaque(), @@ -246,7 +250,7 @@ CudnnBatchNormBackwardThunk::CudnnBatchNormBackwardThunk( Status CudnnBatchNormBackwardThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { dnn::BatchDescriptor operand_desc; dnn::BatchDescriptor scale_offset_desc; @@ -272,7 +276,7 @@ Status CudnnBatchNormBackwardThunk::ExecuteOnStream( se::DeviceMemory(buffer_allocations.GetDeviceAddress(mean_)), se::DeviceMemory(buffer_allocations.GetDeviceAddress(inv_stddev_)), operand_desc, scale_offset_desc, epsilon_, &output_grad_data, - &output_grad_scale, &output_grad_offset); + &output_grad_scale, &output_grad_offset, nullptr, nullptr); // Write the output tuple. 
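For reference, the inference-mode computation these cuDNN batch-norm calls perform is, elementwise per feature map, y = scale * (x - mean) / sqrt(variance + epsilon) + offset; the training path additionally produces the batch mean and saved inverse stddev, which is why the forward-training thunk wires those extra outputs. A scalar host-side sketch of the inference formula (not the cuDNN path itself):

    #include <cmath>

    // Elementwise batch-norm inference for one activation x in a channel with
    // running statistics (mean, variance) and learned (scale, offset).
    float BatchNormInference(float x, float mean, float variance, float scale,
                             float offset, float epsilon) {
      const float inv_stddev = 1.0f / std::sqrt(variance + epsilon);
      return scale * (x - mean) * inv_stddev + offset;
    }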
void* ptrs[] = {output_grad_data.opaque(), output_grad_scale.opaque(), diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h index d2143b39529..e0e6e86818f 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h @@ -61,7 +61,7 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk { const CudnnBatchNormForwardInferenceThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: @@ -92,7 +92,7 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk { const CudnnBatchNormForwardTrainingThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: @@ -126,7 +126,7 @@ class CudnnBatchNormBackwardThunk : public Thunk { delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc index 9ef5f07d857..1e9c3d83c56 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc @@ -178,8 +178,8 @@ struct ConvCacheStats { int64 cache_misses = 0; void LogStats() { - VLOG(1) << "Cache hits: " << cache_hits; - VLOG(1) << "Cache misses: " << cache_misses; + VLOG(2) << "Cache hits: " << cache_hits; + VLOG(2) << "Cache misses: " << cache_misses; } }; @@ -269,8 +269,7 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( if (allocator_ != nullptr) { allocator = allocator_; } else { - se_allocator.emplace(stream_exec_->platform(), - absl::Span({stream_exec_})); + se_allocator.emplace(stream_exec_); allocator = &*se_allocator; } @@ -302,12 +301,15 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( break; } case xla::F32: { - uint32 bits; - memcpy(&bits, &kBroadcastedConstant, sizeof(bits)); - stream.ThenMemset32(&buffer, bits, buffer.size()); + se::DeviceMemory typed_buffer(buffer); + stream.ThenPopulateRandUniform(&typed_buffer); + break; + } + case xla::F64: { + se::DeviceMemory typed_buffer(buffer); + stream.ThenPopulateRandUniform(&typed_buffer); break; } - // TODO(timshen): populate non-zero data for f64. default: stream.ThenMemZero(&buffer, buffer.size()); } @@ -425,6 +427,8 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( << AlgorithmToString(first_algorithm) << " vs " << AlgorithmToString(alg); PrintPlatformInfo(&stream); + VLOG(1) << "Full module on failure: \n" + << instr->GetModule()->ToString(); auto* fail = result.mutable_failure(); fail->set_kind(AutotuneResult::WRONG_RESULT); auto* reference_conv = fail->mutable_reference_conv(); @@ -462,12 +466,10 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec_); log.set_device_pci_bus_id( stream_exec_->GetDeviceDescription().pci_bus_id()); + VLOG(1) << "Autotuning result: " << log.ShortDebugString(); // If we crash on checking failure, we are in a testing/benchmark mode, thus - // print more information instead of logging to the logger. 
- if (crash_on_checking_failure) { - LOG(INFO) << "Autotuning result: " << log.ShortDebugString(); - } else { - VLOG(2) << "Autotuning result:\n" << log.DebugString(); + // omitting logging through the logger. + if (!crash_on_checking_failure) { tensorflow::Logger::Singleton()->LogProto(log); } } @@ -527,7 +529,7 @@ StatusOr CudnnConvAlgorithmPicker::RunOnInstruction( } auto best_algo = std::move(best_algo_or).ValueOrDie(); - VLOG(1) << "Setting cudnn conv to use algorithm " + VLOG(2) << "Setting cudnn conv to use algorithm " << best_algo.conv().algorithm() << " and " << NumBytesToString(best_algo.scratch_bytes()) << " of scratch memory: " << instr->ToString() @@ -548,7 +550,7 @@ StatusOr CudnnConvAlgorithmPicker::RunOnInstruction( HloInstruction* new_call = computation->AddInstruction( instr->CloneWithNewOperands(new_call_shape, instr->operands())); - VLOG(1) << "Replacing convolution " << instr->ToString() << " with " + VLOG(2) << "Replacing convolution " << instr->ToString() << " with " << new_call->ToString(); TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config)); diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc index cd0198e2cb9..c2817e36466 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc @@ -14,10 +14,10 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h" + #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" -#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -39,42 +39,6 @@ using se::dnn::FilterDescriptor; using se::dnn::FilterLayout; using se::dnn::ProfileResult; -struct CudnnConvParams { - // Here are the fields related to cuDNN's fused convolution. The result thus - // is defined as: - // activation(conv_result_scale * conv(x, w) + - // side_input_scale * side_input + broadcast(bias)) - // - // The most common fused conv is conv forward + relu/identity, for example. - // - // bias_buf is a single-dimensional array, with the length equal to the number - // of output features. It'll be broadcasted to the output shape in order to be - // added to the final results. - // - // side_input_buf, if valid, must have the same shape as the output buffer. - struct FusionParams { - se::dnn::ActivationMode mode; - double side_input_scale; - se::DeviceMemoryBase bias_buf; - se::DeviceMemoryBase side_input_buf; // nullable - }; - - CudnnConvKind kind; - const Shape* input_shape; - const Shape* filter_shape; - const Shape* output_shape; - se::DeviceMemoryBase input_buf; - se::DeviceMemoryBase filter_buf; - se::DeviceMemoryBase output_buf; - const Window* window; - const ConvolutionDimensionNumbers* dnums; - int64 feature_group_count; - se::dnn::AlgorithmConfig algorithm; - double conv_result_scale; - - absl::optional fusion; -}; - // A StreamExecutor ScratchAllocator that wraps a single XLA allocation, // returning it (in its entirety) the first time Allocate() is called. 
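The class defined just below hands out its single pre-sized buffer exactly once; a distilled sketch of that one-shot behavior, with error handling reduced to std::optional and hypothetical names:

    #include <cstdint>
    #include <optional>

    // Wraps one fixed buffer and returns it, in its entirety, on the first
    // Allocate() call; any later call (or an oversized request) fails.
    class OneShotScratchAllocator {
     public:
      OneShotScratchAllocator(void* buffer, int64_t size)
          : buffer_(buffer), size_(size) {}

      std::optional<void*> Allocate(int64_t byte_size) {
        if (allocated_ || byte_size > size_) {
          return std::nullopt;  // Already handed out, or request too large.
        }
        allocated_ = true;
        return buffer_;
      }

     private:
      void* buffer_;
      int64_t size_;
      bool allocated_ = false;
    };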
class ScratchBufAllocator : public se::ScratchAllocator { @@ -110,132 +74,19 @@ class ScratchBufAllocator : public se::ScratchAllocator { }; template -Status RunCudnnConvImpl(CudnnConvParams params, +Status RunCudnnConvImpl(const CudnnConvParams& params, se::ScratchAllocator* scratch_allocator, - se::Stream* stream, - se::dnn::ProfileResult* profile_result) { - CudnnConvKind kind = params.kind; - const Shape& input_shape = *params.input_shape; - const Shape& filter_shape = *params.filter_shape; - const Shape& output_shape = *params.output_shape; - DeviceMemory input_buf(params.input_buf); - DeviceMemory filter_buf(params.filter_buf); - DeviceMemory output_buf(params.output_buf); - const Window& window = *params.window; - const ConvolutionDimensionNumbers& dnums = *params.dnums; - int64 feature_group_count = params.feature_group_count; + se::Stream* stream, RunConvOptions options) { + auto input_buf = se::DeviceMemory(params.input_buf); + auto filter_buf = se::DeviceMemory(params.filter_buf); + auto output_buf = se::DeviceMemory(params.output_buf); AlgorithmConfig algorithm = params.algorithm; - VLOG(3) << "Convolution Algorithm: " << algorithm.algorithm()->algo_id(); - VLOG(3) << "tensor_ops_enabled: " - << algorithm.algorithm()->tensor_ops_enabled(); - VLOG(3) << "Convolution kind: " << CudnnConvKindToString(kind); - VLOG(3) << "input shape: " << ShapeUtil::HumanStringWithLayout(input_shape); - VLOG(3) << "filter shape: " << ShapeUtil::HumanStringWithLayout(filter_shape); - VLOG(3) << "Output shape: " << ShapeUtil::HumanStringWithLayout(output_shape); - VLOG(3) << "Window: { " << window.ShortDebugString() << " }"; - VLOG(3) << "Dim nums: { " << dnums.ShortDebugString() << " }"; - - const int num_dimensions = window.dimensions_size(); - CHECK_LE(num_dimensions, 3); - CHECK_GE(num_dimensions, 1); - // cuDNN does not support 1D convolutions. We therefore express 1D - // convolutions as 2D convolutions where the first spatial dimension is 1. - // This matches the behavior of TF (see definition of conv1d in - // tensorflow/python/ops/nn_ops.py). - const int effective_num_dimensions = std::max(2, num_dimensions); - - CHECK_EQ(primitive_util::NativeToPrimitiveType(), - output_shape.element_type()) - << ShapeUtil::HumanString(output_shape); - - // If one dimension is reversed, we need to have all dimensions reversed (so - // we're doing convolution not cross correlation). - const bool dims_reversed = window.dimensions()[0].window_reversal(); - - CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size()); - CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size()); - CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size()); - for (const WindowDimension& dim : window.dimensions()) { - CHECK_EQ(dims_reversed, dim.window_reversal()); - CHECK_EQ(dim.padding_low(), dim.padding_high()); - CHECK_EQ(dim.base_dilation(), 1) - << "cudnn does not support base dilation; it " - "must be made explicit with a kPad"; - CHECK_EQ(dim.window_dilation(), 1) - << "XLA does not support window dilation (although cudnn does); it " - "must be made explicit with a kPad"; + if (options.algo_override) { + algorithm = AlgorithmConfig(*options.algo_override); } - // cuDNN's convolution APIs support the BDYX layout for activations/output and - // the OIYX layout for weights. 
- DataLayout input_dl; - FilterLayout filter_dl; - DataLayout output_dl; - - TF_ASSIGN_OR_RETURN(std::tie(input_dl, filter_dl, output_dl), - XlaConvLayoutsToStreamExecutorLayouts( - dnums, input_shape.layout(), filter_shape.layout(), - output_shape.layout())); - - BatchDescriptor input_descriptor(effective_num_dimensions); - input_descriptor.set_layout(input_dl) - .set_feature_map_count( - input_shape.dimensions(dnums.input_feature_dimension())) - .set_count(input_shape.dimensions(dnums.input_batch_dimension())); - for (int dim = 0; dim < num_dimensions; ++dim) { - // Note that the dimensions are reversed. The same holds below. - input_descriptor.set_spatial_dim( - static_cast(effective_num_dimensions - dim - 1), - input_shape.dimensions(dnums.input_spatial_dimensions(dim))); - } - - FilterDescriptor filter_descriptor(effective_num_dimensions); - filter_descriptor.set_layout(filter_dl) - .set_input_feature_map_count( - filter_shape.dimensions(dnums.kernel_input_feature_dimension())) - .set_output_feature_map_count( - filter_shape.dimensions(dnums.kernel_output_feature_dimension())); - for (int dim = 0; dim < num_dimensions; ++dim) { - filter_descriptor.set_spatial_dim( - static_cast(effective_num_dimensions - dim - 1), - filter_shape.dimensions(dnums.kernel_spatial_dimensions(dim))); - } - - ConvolutionDescriptor convolution_descriptor(effective_num_dimensions); - convolution_descriptor.set_group_count(feature_group_count); - convolution_descriptor.set_convolution_not_crosscorr(dims_reversed); - for (int dim = 0; dim < num_dimensions; ++dim) { - convolution_descriptor - .set_zero_padding( - static_cast(effective_num_dimensions - dim - 1), - window.dimensions(dim).padding_low()) - .set_filter_stride( - static_cast(effective_num_dimensions - dim - 1), - window.dimensions(dim).stride()); - } - - BatchDescriptor output_descriptor(effective_num_dimensions); - output_descriptor.set_layout(output_dl) - .set_feature_map_count( - output_shape.dimensions(dnums.output_feature_dimension())) - .set_count(output_shape.dimensions(dnums.output_batch_dimension())); - for (int dim = 0; dim < num_dimensions; ++dim) { - output_descriptor.set_spatial_dim( - static_cast(effective_num_dimensions - dim - 1), - output_shape.dimensions(dnums.output_spatial_dimensions(dim))); - } - - // Add a singleton dimension in the 1D convolution case. 
- if (num_dimensions == 1) { - input_descriptor.set_spatial_dim(static_cast(0), 1); - output_descriptor.set_spatial_dim(static_cast(0), 1); - filter_descriptor.set_spatial_dim(static_cast(0), 1); - convolution_descriptor.set_zero_padding(static_cast(0), 0) - .set_filter_stride(static_cast(0), 1); - } - - switch (kind) { + switch (params.kind) { case CudnnConvKind::kForward: if (params.conv_result_scale != 1) { return InternalError( @@ -243,9 +94,9 @@ Status RunCudnnConvImpl(CudnnConvParams params, params.conv_result_scale); } stream->ThenConvolveWithAlgorithm( - input_descriptor, input_buf, filter_descriptor, filter_buf, - convolution_descriptor, output_descriptor, &output_buf, - scratch_allocator, algorithm, profile_result); + params.input_descriptor, input_buf, params.filter_descriptor, + filter_buf, params.conv_desc, params.output_descriptor, &output_buf, + scratch_allocator, algorithm, options.profile_result); break; case CudnnConvKind::kBackwardInput: if (params.conv_result_scale != 1) { @@ -254,9 +105,9 @@ Status RunCudnnConvImpl(CudnnConvParams params, params.conv_result_scale); } stream->ThenConvolveBackwardDataWithAlgorithm( - filter_descriptor, filter_buf, output_descriptor, output_buf, - convolution_descriptor, input_descriptor, &input_buf, - scratch_allocator, algorithm, profile_result); + params.filter_descriptor, filter_buf, params.output_descriptor, + output_buf, params.conv_desc, params.input_descriptor, &input_buf, + scratch_allocator, algorithm, options.profile_result); break; case CudnnConvKind::kBackwardFilter: if (params.conv_result_scale != 1) { @@ -265,18 +116,17 @@ Status RunCudnnConvImpl(CudnnConvParams params, params.conv_result_scale); } stream->ThenConvolveBackwardFilterWithAlgorithm( - input_descriptor, input_buf, output_descriptor, output_buf, - convolution_descriptor, filter_descriptor, &filter_buf, - scratch_allocator, algorithm, profile_result); + params.input_descriptor, input_buf, params.output_descriptor, + output_buf, params.conv_desc, params.filter_descriptor, &filter_buf, + scratch_allocator, algorithm, options.profile_result); break; case CudnnConvKind::kForwardActivation: { BatchDescriptor bias_desc; bias_desc.set_count(1) .set_height(1) .set_width(1) - .set_feature_map_count( - output_shape.dimensions(dnums.output_feature_dimension())) - .set_layout(output_dl); + .set_feature_map_count(params.output_descriptor.feature_map_count()) + .set_layout(params.output_descriptor.layout()); se::DeviceMemory side_input(params.fusion->side_input_buf); // If there is no side input, use output as the side input. 
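The kForwardActivation case in the next hunk implements the fused form described in the CudnnConvParams comment: activation(conv_result_scale * conv(x, w) + side_input_scale * side_input + broadcast(bias)); when no side input exists, the output buffer stands in for it, presumably with a zero side_input_scale so the result is unaffected. A scalar sketch of just that epilogue, applied to one already-computed convolution output element and assuming ReLU as the activation:

    #include <algorithm>

    // Applies the fused-convolution epilogue to a single output element, given
    // the raw convolution result for that element.
    float FusedConvEpilogue(float conv_result, float conv_result_scale,
                            float side_input, float side_input_scale,
                            float bias) {
      const float pre_activation = conv_result_scale * conv_result +
                                   side_input_scale * side_input + bias;
      return std::max(pre_activation, 0.0f);  // ReLU activation.
    }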
@@ -296,12 +146,12 @@ Status RunCudnnConvImpl(CudnnConvParams params, } stream->ThenFusedConvolveWithAlgorithm( - input_descriptor, input_buf, params.conv_result_scale, - filter_descriptor, filter_buf, convolution_descriptor, side_input, + params.input_descriptor, input_buf, params.conv_result_scale, + params.filter_descriptor, filter_buf, params.conv_desc, side_input, params.fusion->side_input_scale, bias_desc, DeviceMemory(params.fusion->bias_buf), params.fusion->mode, - output_descriptor, &output_buf, scratch_allocator, algorithm, - profile_result); + params.output_descriptor, &output_buf, scratch_allocator, algorithm, + options.profile_result); break; } } @@ -309,14 +159,14 @@ Status RunCudnnConvImpl(CudnnConvParams params, if (!stream->ok()) { return InternalError( "Unable to launch convolution with type %s and algorithm (%d, %d)", - CudnnConvKindToString(kind), algorithm.algorithm()->algo_id(), + CudnnConvKindToString(params.kind), algorithm.algorithm()->algo_id(), algorithm.algorithm_no_scratch()->algo_id()); } return Status::OK(); } -// Returns the cudnn convolution parameters generated from conv, which must be a -// custom-call to a cudnn convolution. +} // anonymous namespace + StatusOr GetCudnnConvParams( const HloCustomCallInstruction* conv, absl::Span operand_buffers, @@ -325,50 +175,46 @@ StatusOr GetCudnnConvParams( TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config, conv->backend_config()); - TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(conv)); - const auto& lhs_shape = conv->operand(0)->shape(); - const auto& rhs_shape = conv->operand(1)->shape(); - const auto& conv_result_shape = conv->shape().tuple_shapes(0); + TF_ASSIGN_OR_RETURN(params.kind, GetCudnnConvKind(conv)); + const Shape* input_shape; + const Shape* filter_shape; + const Shape* output_shape; - params.kind = kind; - params.window = &conv->window(); - params.dnums = &conv->convolution_dimension_numbers(); - params.feature_group_count = conv->feature_group_count(); params.algorithm = se::dnn::AlgorithmConfig(se::dnn::AlgorithmDesc( backend_config.algorithm(), backend_config.tensor_ops_enabled())); params.conv_result_scale = backend_config.conv_result_scale(); - switch (kind) { + switch (params.kind) { case CudnnConvKind::kForward: - params.input_shape = &lhs_shape; - params.filter_shape = &rhs_shape; - params.output_shape = &conv_result_shape; + input_shape = &conv->operand(0)->shape(); + filter_shape = &conv->operand(1)->shape(); + output_shape = &conv->shape().tuple_shapes(0); params.input_buf = operand_buffers[0]; params.filter_buf = operand_buffers[1]; params.output_buf = result_buffer; break; case CudnnConvKind::kBackwardInput: - params.input_shape = &conv_result_shape; - params.filter_shape = &rhs_shape; - params.output_shape = &lhs_shape; + input_shape = &conv->shape().tuple_shapes(0); + filter_shape = &conv->operand(1)->shape(); + output_shape = &conv->operand(0)->shape(); params.input_buf = result_buffer; params.filter_buf = operand_buffers[1]; params.output_buf = operand_buffers[0]; break; case CudnnConvKind::kBackwardFilter: - params.input_shape = &lhs_shape; - params.filter_shape = &conv_result_shape; - params.output_shape = &rhs_shape; + input_shape = &conv->operand(0)->shape(); + filter_shape = &conv->shape().tuple_shapes(0); + output_shape = &conv->operand(1)->shape(); params.input_buf = operand_buffers[0]; params.filter_buf = result_buffer; params.output_buf = operand_buffers[1]; break; case CudnnConvKind::kForwardActivation: { - params.input_shape = &lhs_shape; - 
params.filter_shape = &rhs_shape; - params.output_shape = &conv_result_shape; + input_shape = &conv->operand(0)->shape(); + filter_shape = &conv->operand(1)->shape(); + output_shape = &conv->shape().tuple_shapes(0); params.fusion.emplace(); - auto& fusion = *params.fusion; + CudnnConvParams::FusionParams& fusion = *params.fusion; if (!se::dnn::ActivationMode_IsValid(backend_config.activation_mode())) { return InternalError("Bad activation mode: %s", backend_config.ShortDebugString()); @@ -385,11 +231,129 @@ StatusOr GetCudnnConvParams( } } } + + const Window& window = conv->window(); + const ConvolutionDimensionNumbers& dnums = + conv->convolution_dimension_numbers(); + + VLOG(3) << "Convolution Algorithm: " + << params.algorithm.algorithm()->algo_id(); + VLOG(3) << "tensor_ops_enabled: " + << params.algorithm.algorithm()->tensor_ops_enabled(); + VLOG(3) << "Convolution kind: " << CudnnConvKindToString(params.kind); + VLOG(3) << "input shape: " << ShapeUtil::HumanStringWithLayout(*input_shape); + VLOG(3) << "filter shape: " + << ShapeUtil::HumanStringWithLayout(*filter_shape); + VLOG(3) << "Output shape: " + << ShapeUtil::HumanStringWithLayout(*output_shape); + VLOG(3) << "Window: { " << window.ShortDebugString() << " }"; + VLOG(3) << "Dim nums: { " << dnums.ShortDebugString() << " }"; + + const int num_dimensions = window.dimensions_size(); + CHECK_LE(num_dimensions, 3) << conv->ToString(); + CHECK_GE(num_dimensions, 1) << conv->ToString(); + // cuDNN does not support 1D convolutions. We therefore express 1D + // convolutions as 2D convolutions where the first spatial dimension is 1. + // This matches the behavior of TF (see definition of conv1d in + // tensorflow/python/ops/nn_ops.py). + const int effective_num_dimensions = std::max(2, num_dimensions); + + // If one dimension is reversed, we need to have all dimensions reversed (so + // we're doing convolution not cross correlation). + const bool dims_reversed = window.dimensions()[0].window_reversal(); + + CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size()) + << conv->ToString(); + CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size()) + << conv->ToString(); + CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size()) + << conv->ToString(); + for (const WindowDimension& dim : window.dimensions()) { + CHECK_EQ(dims_reversed, dim.window_reversal()) << conv->ToString(); + CHECK_EQ(dim.padding_low(), dim.padding_high()) << conv->ToString(); + CHECK_EQ(dim.base_dilation(), 1) + << "cudnn does not support base dilation; it " + "must be made explicit with a kPad: " + << conv->ToString(); + } + + // cuDNN's convolution APIs support the BDYX layout for activations/output and + // the OIYX layout for weights. + DataLayout input_dl; + FilterLayout filter_dl; + DataLayout output_dl; + + TF_ASSIGN_OR_RETURN(std::tie(input_dl, filter_dl, output_dl), + XlaConvLayoutsToStreamExecutorLayouts( + dnums, input_shape->layout(), filter_shape->layout(), + output_shape->layout())); + + BatchDescriptor& input_descriptor = params.input_descriptor; + input_descriptor = BatchDescriptor(effective_num_dimensions); + input_descriptor.set_layout(input_dl) + .set_feature_map_count( + input_shape->dimensions(dnums.input_feature_dimension())) + .set_count(input_shape->dimensions(dnums.input_batch_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + // Note that the dimensions are reversed. The same holds below. 
+ input_descriptor.set_spatial_dim( + static_cast(effective_num_dimensions - dim - 1), + input_shape->dimensions(dnums.input_spatial_dimensions(dim))); + } + + FilterDescriptor& filter_descriptor = params.filter_descriptor; + filter_descriptor = FilterDescriptor(effective_num_dimensions); + filter_descriptor.set_layout(filter_dl) + .set_input_feature_map_count( + filter_shape->dimensions(dnums.kernel_input_feature_dimension())) + .set_output_feature_map_count( + filter_shape->dimensions(dnums.kernel_output_feature_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + filter_descriptor.set_spatial_dim( + static_cast(effective_num_dimensions - dim - 1), + filter_shape->dimensions(dnums.kernel_spatial_dimensions(dim))); + } + + params.conv_desc = ConvolutionDescriptor(effective_num_dimensions); + params.conv_desc.set_group_count(conv->feature_group_count()); + params.conv_desc.set_convolution_not_crosscorr(dims_reversed); + for (int dim = 0; dim < num_dimensions; ++dim) { + params.conv_desc + .set_zero_padding( + static_cast(effective_num_dimensions - dim - 1), + window.dimensions(dim).padding_low()) + .set_filter_stride( + static_cast(effective_num_dimensions - dim - 1), + window.dimensions(dim).stride()) + .set_dilation_rate( + static_cast(effective_num_dimensions - dim - 1), + window.dimensions(dim).window_dilation()); + } + + BatchDescriptor& output_descriptor = params.output_descriptor; + output_descriptor = BatchDescriptor(effective_num_dimensions); + output_descriptor.set_layout(output_dl) + .set_feature_map_count( + output_shape->dimensions(dnums.output_feature_dimension())) + .set_count(output_shape->dimensions(dnums.output_batch_dimension())); + for (int dim = 0; dim < num_dimensions; ++dim) { + output_descriptor.set_spatial_dim( + static_cast(effective_num_dimensions - dim - 1), + output_shape->dimensions(dnums.output_spatial_dimensions(dim))); + } + + // Add a singleton dimension in the 1D convolution case. 
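The singleton-dimension code re-added below follows the earlier comment: cuDNN has no 1D convolution, so a 1D problem is posed as a 2D one whose leading spatial dimension has extent 1, matching TF's conv1d. A shape-level sketch of that rewrite (hypothetical helper, NCW/NCHW-style ordering assumed):

    #include <array>
    #include <cstdint>

    // Describes a conv1d problem as an equivalent conv2d problem by inserting
    // a unit spatial dimension in front of the real one.
    struct Conv2dShapes {
      std::array<int64_t, 4> input;   // {batch, features, 1, width}
      std::array<int64_t, 4> filter;  // {out_features, in_features, 1, width}
    };

    Conv2dShapes AsConv2d(int64_t batch, int64_t in_features, int64_t width,
                          int64_t out_features, int64_t filter_width) {
      Conv2dShapes shapes;
      shapes.input = {batch, in_features, 1, width};
      shapes.filter = {out_features, in_features, 1, filter_width};
      return shapes;
    }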
+ if (num_dimensions == 1) { + input_descriptor.set_spatial_dim(static_cast(0), 1); + output_descriptor.set_spatial_dim(static_cast(0), 1); + filter_descriptor.set_spatial_dim(static_cast(0), 1); + params.conv_desc.set_zero_padding(static_cast(0), 0) + .set_filter_stride(static_cast(0), 1); + } + return params; } -} // anonymous namespace - Status RunCudnnConv(const HloCustomCallInstruction* conv, absl::Span operand_buffers, se::DeviceMemoryBase result_buffer, @@ -408,24 +372,20 @@ Status RunCudnnConv(const HloCustomCallInstruction* conv, TF_ASSIGN_OR_RETURN(CudnnConvParams params, GetCudnnConvParams(conv, operand_buffers, result_buffer)); - if (options.algo_override) { - params.algorithm = AlgorithmConfig(*options.algo_override); - } - PrimitiveType output_primitive_type = conv->shape().tuple_shapes(0).element_type(); switch (output_primitive_type) { case F16: return RunCudnnConvImpl(params, scratch_allocator, stream, - options.profile_result); + options); case F32: return RunCudnnConvImpl(params, scratch_allocator, stream, - options.profile_result); + options); case F64: return RunCudnnConvImpl(params, scratch_allocator, stream, - options.profile_result); + options); default: - LOG(FATAL) << ShapeUtil::HumanString(*params.output_shape); + LOG(FATAL) << conv->ToString(); } } diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h index 25b2461ca61..14124a08369 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_RUNNER_H_ #include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/status.h" @@ -36,6 +37,41 @@ struct RunConvOptions { absl::optional algo_override; }; +// Implementation struct exposed for debugging and log analysis. +struct CudnnConvParams { + // Here are the fields related to cuDNN's fused convolution. The result thus + // is defined as: + // activation(conv_result_scale * conv(x, w) + + // side_input_scale * side_input + broadcast(bias)) + // + // The most common fused conv is conv forward + relu/identity, for example. + // + // bias_buf is a single-dimensional array, with the length equal to the number + // of output features. It'll be broadcasted to the output shape in order to be + // added to the final results. + // + // side_input_buf, if valid, must have the same shape as the output buffer. + struct FusionParams { + se::dnn::ActivationMode mode; + double side_input_scale; + se::DeviceMemoryBase bias_buf; + se::DeviceMemoryBase side_input_buf; // nullable + }; + + CudnnConvKind kind; + se::dnn::BatchDescriptor input_descriptor; + se::dnn::FilterDescriptor filter_descriptor; + se::dnn::BatchDescriptor output_descriptor; + se::DeviceMemoryBase input_buf; + se::DeviceMemoryBase filter_buf; + se::DeviceMemoryBase output_buf; + se::dnn::ConvolutionDescriptor conv_desc; + se::dnn::AlgorithmConfig algorithm; + double conv_result_scale; + + absl::optional fusion; +}; + // This file contains low-level routines for running cudnn convolutions. // Calls into cudnn to run the specified convolution. 
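RunCudnnConv above switches on the output element type (F16, F32, F64) and forwards the shared CudnnConvParams to an implementation templated on the matching native type. A generic sketch of that dispatch pattern, with hypothetical names and a placeholder half type:

    #include <cstdint>

    enum class ElementType { kF16, kF32, kF64 };

    // Placeholder for a 16-bit float type such as Eigen::half.
    struct Half { uint16_t bits; };

    template <typename T>
    int RunConvImpl(const void* params) {
      // A real implementation would view the buffers as DeviceMemory<T> and
      // issue the cuDNN call; here we only report the element width.
      (void)params;
      return sizeof(T);
    }

    int RunConv(ElementType type, const void* params) {
      switch (type) {
        case ElementType::kF16:
          return RunConvImpl<Half>(params);
        case ElementType::kF32:
          return RunConvImpl<float>(params);
        case ElementType::kF64:
          return RunConvImpl<double>(params);
      }
      return 0;
    }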
@@ -62,6 +98,12 @@ Status RunCudnnConv(const HloCustomCallInstruction* conv, se::ScratchAllocator* scratch_allocator, se::Stream* stream, RunConvOptions = {}); +// Implementation details exposed for debugging and log analysis. +StatusOr GetCudnnConvParams( + const HloCustomCallInstruction* conv, + absl::Span operand_buffers, + se::DeviceMemoryBase result_buffer); + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_context.cc b/tensorflow/compiler/xla/service/gpu/cusolver_context.cc index 923b7bc4528..4103a720c98 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_context.cc +++ b/tensorflow/compiler/xla/service/gpu/cusolver_context.cc @@ -91,12 +91,14 @@ StatusOr CusolverContext::Create(se::Stream* stream) { TF_RETURN_IF_ERROR(CusolverStatusToStatus(cusolverDnCreate(&handle))); CusolverContext context(stream, handle); - // StreamExecutor really should just expose the Cuda stream to clients... - const cudaStream_t* cuda_stream = - CHECK_NOTNULL(reinterpret_cast( - stream->implementation()->GpuStreamMemberHack())); - TF_RETURN_IF_ERROR( - CusolverStatusToStatus(cusolverDnSetStream(handle, *cuda_stream))); + if (stream) { + // StreamExecutor really should just expose the Cuda stream to clients... + const cudaStream_t* cuda_stream = + CHECK_NOTNULL(reinterpret_cast( + stream->implementation()->GpuStreamMemberHack())); + TF_RETURN_IF_ERROR( + CusolverStatusToStatus(cusolverDnSetStream(handle, *cuda_stream))); + } return std::move(context); } @@ -131,17 +133,40 @@ CusolverContext::~CusolverContext() { #define DN_SOLVER_FN(method, type_prefix) cusolverDn##type_prefix##method -#define POTRF_BUFFER_SIZE_INSTANCE(T, type_prefix) \ - StatusOr CusolverContext::PotrfBufferSize( \ - se::blas::UpperLower uplo, int n, se::DeviceMemory A, int lda) { \ - int size = -1; \ - TF_RETURN_IF_ERROR(CusolverStatusToStatus(DN_SOLVER_FN( \ - potrf_bufferSize, type_prefix)(handle(), CUDABlasUpperLower(uplo), n, \ - ToDevicePointer(A), lda, &size))); \ - return size; \ +// Note: NVidia have promised that it is safe to pass 'nullptr' as the argument +// buffers to cuSolver buffer size methods and this will be a documented +// behavior in a future cuSolver release. 
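That is why the rewritten PotrfBufferSize below passes nullptr for the matrix argument and needs only the dimension and element type. For context, potrf computes the Cholesky factorization A = L * L^T of a symmetric positive-definite matrix; a small unblocked host reference (row-major, lower-triangular, no error checking) looks roughly like this:

    #include <cmath>
    #include <vector>

    // Unblocked Cholesky factorization of an n x n symmetric positive-definite
    // matrix stored row-major in `a`; returns the lower-triangular factor L.
    std::vector<double> CholeskyLower(const std::vector<double>& a, int n) {
      std::vector<double> l(n * n, 0.0);
      for (int j = 0; j < n; ++j) {
        double diag = a[j * n + j];
        for (int k = 0; k < j; ++k) diag -= l[j * n + k] * l[j * n + k];
        l[j * n + j] = std::sqrt(diag);
        for (int i = j + 1; i < n; ++i) {
          double sum = a[i * n + j];
          for (int k = 0; k < j; ++k) sum -= l[i * n + k] * l[j * n + k];
          l[i * n + j] = sum / l[j * n + j];
        }
      }
      return l;
    }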
+StatusOr CusolverContext::PotrfBufferSize(PrimitiveType type, + se::blas::UpperLower uplo, + int n, int lda) { + int size = -1; + switch (type) { + case F32: { + TF_RETURN_IF_ERROR(CusolverStatusToStatus(cusolverDnSpotrf_bufferSize( + handle(), CUDABlasUpperLower(uplo), n, /*A=*/nullptr, lda, &size))); + break; + } + case F64: { + TF_RETURN_IF_ERROR(CusolverStatusToStatus(cusolverDnDpotrf_bufferSize( + handle(), CUDABlasUpperLower(uplo), n, /*A=*/nullptr, lda, &size))); + break; + } + case C64: { + TF_RETURN_IF_ERROR(CusolverStatusToStatus(cusolverDnCpotrf_bufferSize( + handle(), CUDABlasUpperLower(uplo), n, /*A=*/nullptr, lda, &size))); + break; + } + case C128: { + TF_RETURN_IF_ERROR(CusolverStatusToStatus(cusolverDnZpotrf_bufferSize( + handle(), CUDABlasUpperLower(uplo), n, /*A=*/nullptr, lda, &size))); + break; + } + default: + return InvalidArgument("Invalid type for cholesky decomposition: %s", + PrimitiveType_Name(type)); } - -CALL_LAPACK_TYPES(POTRF_BUFFER_SIZE_INSTANCE); + return size; +} #define POTRF_INSTANCE(T, type_prefix) \ Status CusolverContext::Potrf( \ diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_context.h b/tensorflow/compiler/xla/service/gpu/cusolver_context.h index 68b5fb14c6b..c3d075c47c7 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_context.h +++ b/tensorflow/compiler/xla/service/gpu/cusolver_context.h @@ -32,6 +32,8 @@ namespace gpu { class CusolverContext { public: + // stream may be nullptr, in which case the context can only be used for + // buffer size queries. static StatusOr Create(se::Stream* stream); CusolverContext() = default; ~CusolverContext(); @@ -63,17 +65,9 @@ class CusolverContext { se::DeviceMemory> workspace); // Returns the size of the `workspace` required by Potrf, in number of - // elements of size T. - StatusOr PotrfBufferSize(se::blas::UpperLower uplo, int n, - se::DeviceMemory dev_A, int lda); - StatusOr PotrfBufferSize(se::blas::UpperLower uplo, int n, - se::DeviceMemory dev_A, int lda); - StatusOr PotrfBufferSize(se::blas::UpperLower uplo, int n, - se::DeviceMemory> dev_A, - int lda); - StatusOr PotrfBufferSize(se::blas::UpperLower uplo, int n, - se::DeviceMemory> dev_A, - int lda); + // elements of `type`. + StatusOr PotrfBufferSize(PrimitiveType type, se::blas::UpperLower uplo, + int n, int lda); private: CusolverContext(se::Stream* stream, cusolverDnHandle_t handle); diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc index 2ba6e8fc3c5..64c3c319321 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" -#include "tensorflow/compiler/xla/service/gpu/scratch_allocator.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -31,7 +30,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/stream_executor/blas.h" namespace xla { @@ -48,7 +46,6 @@ void SetFortranLayout(Shape* shape) { } StatusOr CreateCholesky(CusolverContext* context, - ScratchAllocator* allocator, HloInstruction* operand, const CholeskyOptions& options, const OpMetadata& metadata) { @@ -67,39 +64,8 @@ StatusOr CreateCholesky(CusolverContext* context, se::blas::UpperLower uplo = options.lower() ? se::blas::UpperLower::kLower : se::blas::UpperLower::kUpper; int64 workspace_size; // Number of elements of size a_shape.element_type() - switch (a_shape.element_type()) { - case F32: { - TF_ASSIGN_OR_RETURN(auto a, - allocator->Allocate(context->stream(), n * n)); - TF_ASSIGN_OR_RETURN(workspace_size, - context->PotrfBufferSize(uplo, n, a, n)); - break; - } - case F64: { - TF_ASSIGN_OR_RETURN( - auto a, allocator->Allocate(context->stream(), n * n)); - TF_ASSIGN_OR_RETURN(workspace_size, - context->PotrfBufferSize(uplo, n, a, n)); - break; - } - case C64: { - TF_ASSIGN_OR_RETURN(auto a, allocator->Allocate>( - context->stream(), n * n)); - TF_ASSIGN_OR_RETURN(workspace_size, - context->PotrfBufferSize(uplo, n, a, n)); - break; - } - case C128: { - TF_ASSIGN_OR_RETURN(auto a, allocator->Allocate>( - context->stream(), n * n)); - TF_ASSIGN_OR_RETURN(workspace_size, - context->PotrfBufferSize(uplo, n, a, n)); - break; - } - default: - return InvalidArgument("Invalid type for cholesky decomposition: %s", - a_shape.ToString()); - } + TF_ASSIGN_OR_RETURN(workspace_size, context->PotrfBufferSize( + a_shape.element_type(), uplo, n, n)); // TODO(phawkins): Ideally we would relax this constraint. What we actually // want is that: @@ -131,7 +97,6 @@ StatusOr CreateCholesky(CusolverContext* context, // Tries to rewrite a single convolution into a call to cudnn. StatusOr RunOnInstruction(CusolverContext* context, - ScratchAllocator* allocator, HloInstruction* instruction) { if (instruction->opcode() != HloOpcode::kCholesky) { return false; @@ -139,7 +104,7 @@ StatusOr RunOnInstruction(CusolverContext* context, TF_ASSIGN_OR_RETURN( HloInstruction * custom_call, - CreateCholesky(context, allocator, instruction->mutable_operand(0), + CreateCholesky(context, instruction->mutable_operand(0), instruction->cholesky_options(), instruction->metadata())); VLOG(1) << "Replacing " << instruction->ToString() << " with " @@ -167,41 +132,18 @@ StatusOr CusolverRewriter::RunOnComputation(HloComputation* computation) { return false; } - // Create a stream for us to do our work on. We don't really need to do any - // work, just allocate memory, but that's the cuSolver API. - se::Stream stream{stream_exec_}; - stream.Init(); - const auto device_ordinal = stream_exec_->device_ordinal(); - - // allocator either points to this->allocator_ or, if that's null, to a - // se::StreamExecutorMemoryAllocator for stream_exec_. 
- se::DeviceMemoryAllocator* allocator; - absl::optional se_allocator; - if (allocator_ != nullptr) { - allocator = allocator_; - } else { - se_allocator.emplace(stream_exec_->platform(), - absl::Span({stream_exec_})); - allocator = &*se_allocator; - } - ScratchAllocator scratch_allocator(device_ordinal, allocator); - TF_ASSIGN_OR_RETURN(CusolverContext context, - CusolverContext::Create(&stream)); + CusolverContext::Create(/*stream=*/nullptr)); bool changed = false; for (HloInstruction* instruction : cusolver_calls) { - TF_ASSIGN_OR_RETURN( - bool result, - RunOnInstruction(&context, &scratch_allocator, instruction)); + TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(&context, instruction)); changed |= result; } return changed; } -CusolverRewriter::CusolverRewriter(se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* allocator) - : stream_exec_(stream_exec), allocator_(allocator) {} +CusolverRewriter::CusolverRewriter() = default; StatusOr CusolverRewriter::Run(HloModule* module) { bool changed = false; diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h index d8c2cc55872..8be7cd5c947 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h +++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h @@ -29,17 +29,13 @@ namespace gpu { // Rewrites Cholesky calls into CustomCall HLOs that call into cuSolver. class CusolverRewriter : public HloModulePass { public: - CusolverRewriter(se::StreamExecutor* stream_exec, - se::DeviceMemoryAllocator* allocator); + CusolverRewriter(); absl::string_view name() const override { return "cusolver-rewriter"; } StatusOr Run(HloModule* module) override; private: StatusOr RunOnComputation(HloComputation* computation); - - se::StreamExecutor* stream_exec_; // never null - se::DeviceMemoryAllocator* allocator_; // may be null }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc index f0f3152ac98..b521e36108b 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc @@ -50,7 +50,7 @@ CustomCallThunk::CustomCallThunk( Status CustomCallThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { // gpu_stream is CUstream or e.g. the equivalent type in ROCm. 
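The code that follows reinterprets the registered call target and invokes it with the raw GPU stream and the buffer table. A sketch of that kind of trampoline, using a hypothetical GpuStreamHandle and an assumed target signature rather than XLA's exact calling convention:

    #include <cstddef>
    #include <vector>

    // Stand-in for CUstream or the ROCm equivalent.
    using GpuStreamHandle = void*;

    // Assumed target signature for illustration: stream, buffer table, opaque
    // configuration blob.
    using CallTarget = void (*)(GpuStreamHandle stream, void** buffers,
                                const char* opaque, size_t opaque_len);

    void InvokeCustomCall(void* registered_target, GpuStreamHandle stream,
                          std::vector<void*>& buffers, const char* opaque,
                          size_t opaque_len) {
      // The registry stores an untyped pointer; the thunk reinterprets it to
      // the agreed-upon signature and calls through.
      auto typed_target = reinterpret_cast<CallTarget>(registered_target);
      typed_target(stream, buffers.data(), opaque, opaque_len);
    }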
auto gpu_stream = se::gpu::AsGpuStreamValue(stream); auto typed_call_target = diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h index 9011fa26ffa..6db7950e8e0 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h @@ -45,7 +45,7 @@ class CustomCallThunk : public Thunk { const HloInstruction* instr); Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc index 1609f0d60c4..55300a8d33a 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc @@ -107,7 +107,7 @@ FftThunk::FftThunk(FftType fft_type, absl::Span fft_length, output_shape_(output_shape) {} Status FftThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& /*run_id*/, HloExecutionProfiler* profiler) { VLOG(3) << "FFT type: " << FftTypeToString(fft_type_); VLOG(3) << "Input shape: " << ShapeUtil::HumanStringWithLayout(input_shape_); diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h index f653e4f12fe..12718db873b 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h @@ -73,7 +73,7 @@ class FftThunk : public Thunk { // Does the FFT for the thunk on "stream". Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index 88f0b4d71c9..ee47fea38c3 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -41,7 +41,7 @@ Status ForThunk::Initialize(const GpuExecutable& executable, } Status ForThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) { VLOG(2) << "Executing ForThunk with " << loop_limit_ << " iters for " << (hlo_instruction() ? hlo_instruction()->ToString() : ""); @@ -49,8 +49,8 @@ Status ForThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, for (int64 i = 0; i < loop_limit_; ++i) { profiler->StartHloComputation(); // Invoke loop body thunk sequence. 
- TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(buffer_allocations, - stream, profiler)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream( + buffer_allocations, stream, run_id, profiler)); profiler->FinishHloComputation(hlo_instruction()->while_body()); } return Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index c2d39071b29..e3bef820e57 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -40,7 +40,7 @@ class ForThunk : public Thunk { Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index 9bbe1ab5a38..237c065cd73 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -446,7 +446,7 @@ GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer, implements_whole_instruction_(implements_whole_instruction) {} Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& /*run_id*/, HloExecutionProfiler* profiler) { auto fn = [&]() { switch (output_shape_.element_type()) { diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index e4f07d04820..3cba1d5e169 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -50,7 +50,7 @@ class GemmThunk : public Thunk { // Does the gemm operation for the thunk on "stream", which must be non-null. 
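The operation behind GemmThunk is the standard GEMM, C = alpha * A * B + beta * C, dispatched by output element type. A naive row-major reference version, handy for host-side result checks:

    #include <vector>

    // Naive row-major GEMM reference: C = alpha * A * B + beta * C, with
    // A of shape (m, k), B of shape (k, n), and C of shape (m, n).
    void NaiveGemm(int m, int n, int k, float alpha, const std::vector<float>& a,
                   const std::vector<float>& b, float beta,
                   std::vector<float>* c) {
      for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
          float acc = 0.0f;
          for (int p = 0; p < k; ++p) acc += a[i * k + p] * b[p * n + j];
          (*c)[i * n + j] = alpha * acc + beta * (*c)[i * n + j];
        }
      }
    }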
Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index dec40c5e49c..8be1655367f 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -119,7 +119,8 @@ Status GpuExecutable::ExecuteThunks( op_annotation.emplace( thunk->hlo_instruction()->ToString(HloPrintOptions::Canonical()), absl::StrCat("#tf_op=", hlo->metadata().op_name(), - ",hlo_op=", hlo->name(), "#")); + ",hlo_op=", hlo->name(), + ",hlo_module=", hlo->GetModule()->name(), "#")); } TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); @@ -136,7 +137,8 @@ Status GpuExecutable::ExecuteThunks( << thunk->hlo_instruction()->ToString() << " on stream " << stream_no; TF_RETURN_IF_ERROR( - thunk->ExecuteOnStream(buffer_allocations, stream, &profiler)); + thunk->ExecuteOnStream(buffer_allocations, stream, + run_options->run_options().run_id(), &profiler)); if (thunk_schedule_->Depended(thunk)) { auto finish_event = absl::make_unique(main_stream->parent()); finish_event->Init(); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc index 676380c3b10..dbf590591c3 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc @@ -28,7 +28,7 @@ InfeedThunk::InfeedThunk( : Thunk(Kind::kInfeed, hlo_instruction), infeed_slices_(infeed_slices) {} Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& /*run_id*/, HloExecutionProfiler* profiler) { VLOG(2) << "Infeeding to GPU: " << hlo_instruction()->ToString(); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h index 59487e245b7..50d9c53d957 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h @@ -41,7 +41,7 @@ class InfeedThunk : public Thunk { InfeedThunk& operator=(const InfeedThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 957a2f00723..c6919740c87 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -312,17 +312,55 @@ llvm::Value* EmitPrintf(absl::string_view fmt, arguments_ptr}); } +// Helper function to emit call to AMDGPU shfl_down function. +llvm::Value* EmitAMDGPUShflDown(llvm::Value* value, llvm::Value* offset, + llvm::IRBuilder<>* b) { + llvm::Module* module = b->GetInsertBlock()->getModule(); + CHECK_EQ(value->getType()->getPrimitiveSizeInBits(), 32); + auto* i32_ty = b->getInt32Ty(); + llvm::FunctionCallee shfl_fn = module->getOrInsertFunction( + llvm_ir::AsStringRef("__ockl_readuplane_i32"), + llvm::FunctionType::get(/*Result=*/i32_ty, {i32_ty, i32_ty}, + /*isVarArg=*/false)); + // AMDGPU device function requires first argument as i32. 
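// Illustrative clarification (comment only, not part of this patch): because
// __ockl_readuplane_i32 is declared as (i32, i32) -> i32, a 32-bit value of
// any other type (e.g. float) is bitcast to i32 before the call and the i32
// result is bitcast back afterwards, which is what the two CreateBitCast
// calls around this point emit. Roughly, for a float input:
//
//   %as_i32 = bitcast float %value to i32
//   %res    = call i32 @__ockl_readuplane_i32(i32 %as_i32, i32 %offset)
//   %result = bitcast i32 %res to float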
+ llvm::Value* result = + b->CreateCall(shfl_fn, {b->CreateBitCast(value, i32_ty), offset}); + // AMDGPU device function always returns an i32 type. + return b->CreateBitCast(result, value->getType()); +} + +// Helper function to emit call to NVPTX shfl_down intrinsic. +llvm::Value* EmitNVPTXShflDown(llvm::Value* value, llvm::Value* offset, + llvm::IRBuilder<>* b) { + llvm::Module* module = b->GetInsertBlock()->getModule(); + llvm::Intrinsic::ID llvm_intrinsic_id; + CHECK_EQ(value->getType()->getPrimitiveSizeInBits(), 32); + if (value->getType()->isFloatTy()) { + llvm_intrinsic_id = llvm::Intrinsic::nvvm_shfl_sync_down_f32; + } else { + llvm_intrinsic_id = llvm::Intrinsic::nvvm_shfl_sync_down_i32; + } + llvm::Function* intrinsic = + llvm::Intrinsic::getDeclaration(module, llvm_intrinsic_id, {}); + return b->CreateCall( + intrinsic, {b->getInt32(-1), value, offset, b->getInt32(kWarpSize - 1)}); +} + llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset, llvm::IRBuilder<>* builder) { int bit_width = value->getType()->getPrimitiveSizeInBits(); - llvm::Value* all_warps_mask = builder->getInt32(-1); + llvm::Module* module = builder->GetInsertBlock()->getModule(); + llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); // Special case for efficiency if (value->getType()->isFloatTy() && bit_width == 32) { - return EmitCallToTargetIntrinsic( - TargetIntrinsicID::kShflDownF32, - {all_warps_mask, value, offset, builder->getInt32(kWarpSize - 1)}, {}, - builder); + if (target_triple.isNVPTX()) { + return EmitNVPTXShflDown(value, offset, builder); + } else if (target_triple.getArch() == llvm::Triple::amdgcn) { + return EmitAMDGPUShflDown(value, offset, builder); + } else { + LOG(FATAL) << "Invalid triple " << target_triple.str(); + } } // We must split values wider than 32 bits as the "shfl" instruction operates @@ -334,14 +372,17 @@ llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset, builder->getIntNTy(32 * num_segments)), llvm::VectorType::get(builder->getInt32Ty(), num_segments)); for (int i = 0; i < num_segments; ++i) { - x = builder->CreateInsertElement( - x, - EmitCallToTargetIntrinsic( - TargetIntrinsicID::kShflDownI32, - {all_warps_mask, builder->CreateExtractElement(x, i), offset, - builder->getInt32(kWarpSize - 1)}, - {}, builder), - i); + llvm::Value* insert_val; + if (target_triple.isNVPTX()) { + insert_val = EmitNVPTXShflDown(builder->CreateExtractElement(x, i), + offset, builder); + } else if (target_triple.getArch() == llvm::Triple::amdgcn) { + insert_val = EmitAMDGPUShflDown(builder->CreateExtractElement(x, i), + offset, builder); + } else { + LOG(FATAL) << "Invalid triple " << target_triple.str(); + } + x = builder->CreateInsertElement(x, insert_val, i); } return builder->CreateBitCast( builder->CreateTrunc( diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index fbe22e3a18e..c85b35ed386 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -70,7 +70,7 @@ void KernelThunk::SetLaunchDimensions(const LaunchDimensions& launch_dims) { } Status KernelThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& /*run_id*/, HloExecutionProfiler* profiler) { // Load the kernel. 
se::StreamExecutor* executor = stream->parent(); diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 2cea89e4e2a..e867904bcf2 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -63,7 +63,7 @@ class KernelThunk : public Thunk { // Executes the kernel for the thunk on "stream", which must be non-null. Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD index ca42807edd1..d025fc99275 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD @@ -1,7 +1,6 @@ -licenses(["notice"]) # Apache 2.0 - package( default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 ) package_group( diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc index 9fd6cf7157e..7a5b14be7b0 100644 --- a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc @@ -23,7 +23,7 @@ namespace gpu { Status MemzeroThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_); auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); stream->ThenMemZero(&dest_data, dest_data.size()); @@ -32,7 +32,7 @@ Status MemzeroThunk::ExecuteOnStream( Status Memset32BitValueThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_); auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); stream->ThenMemset32(&dest_data, value_, dest_data.size()); diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.h b/tensorflow/compiler/xla/service/gpu/memset_thunk.h index d1fec0bd76b..727f2441f39 100644 --- a/tensorflow/compiler/xla/service/gpu/memset_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.h @@ -37,7 +37,7 @@ class MemzeroThunk : public Thunk { : Thunk(Kind::kMemzero, hlo), dest_(dest) {} Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: @@ -54,7 +54,7 @@ class Memset32BitValueThunk : public Thunk { : Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {} Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId&, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc index c00edae9540..89def76afe3 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc @@ -15,12 +15,24 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h" -#include "tensorflow/compiler/xla/util.h" - #if GOOGLE_CUDA +#include +#include +#include +#include + +#include "absl/algorithm/container.h" #include "absl/container/flat_hash_set.h" -#include "absl/synchronization/blocking_counter.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "third_party/nccl/nccl.h" +#include "tensorflow/compiler/xla/refcounting_hash_map.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/blocking_counter.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" @@ -29,6 +41,25 @@ limitations under the License. namespace xla { namespace gpu { +// This file runs collective ops (i.e. ops that communicate between multiple +// GPUs) using NCCL. Currently only kAllReduce is implemented. +// +// Here's a high-level overview of how running an op works. +// +// - Multiple threads call NcclAllReduceThunk::ExecuteOnStream. +// - All threads that "go together" (i.e. are participating in the "same" +// collective op) choose the same Rendezvous object from a global map. +// - Once all threads have arrived at the Rendezvous, we know exactly which +// GPUs are participating in the op, so we get or create a NcclClique +// containing those GPUs. +// - We perform the NCCL operation using the clique, then destroy the +// Rendezvous. The clique is cached, see below. +// +// Creating NCCL cliques is expensive, so we cache them. Our policy is, a thunk +// keeps alive all cliques it's ever used. When the thunk is destroyed, it +// releases its handle on the cliques, and cliques whose refcounts go to 0 are +// destroyed. + /* static */ bool NcclAllReduceThunk::NcclIsEnabled() { #if GOOGLE_CUDA return true; @@ -40,17 +71,145 @@ namespace gpu { #if GOOGLE_CUDA namespace { -// GPU-replica-driving host threads (i.e. the threads that call -// GpuExecutable::Execute) build up this structure to describe their -// participating replica, and then call to -// GlobalRendezvousManager::SubmitParticipant. -struct ParticipantData { - // Number of replicas particiating in the AllReduce. - int64 replica_count; +// Functions to translate an ncclResult_t/cudaError_t to a Status object. Used +// by the macros below. +Status TranslateStatus(ncclResult_t s, const char* file, int64 line, + const char* expr) { + if (s == ncclSuccess) { + return Status::OK(); + } + return tensorflow::errors::Internal( + absl::StrFormat("%s:%d: NCCL operation %s failed: %s", file, line, expr, + ncclGetErrorString(s))); +} +Status TranslateStatus(cudaError_t s, const char* file, int64 line, + const char* expr) { + if (s == cudaSuccess) { + return Status::OK(); + } + return tensorflow::errors::Internal( + absl::StrFormat("%s:%d: CUDA operation %s failed: %s", file, line, expr, + cudaGetErrorString(s))); +} + +// Macros to return or warn on CUDA/NCCL errors. (The same macro works for both +// NCCL and CUDA errors.) +// +// It's tempting to say these macros belong in an XLA header somewhere, but in +// practice we don't do much direct-to-CUDA-API stuff outside of this file. 
+#define XLA_CUDA_RETURN_IF_ERROR(expr) \ + do { \ + Status s = ::xla::gpu::TranslateStatus(expr, __FILE__, __LINE__, #expr); \ + if (!s.ok()) { \ + return s; \ + } \ + } while (0) + +#define XLA_CUDA_WARN_IF_ERROR(expr) \ + do { \ + Status s = ::xla::gpu::TranslateStatus(expr, __FILE__, __LINE__, #expr); \ + if (!s.ok()) { \ + LOG(ERROR) << s.ToString(); \ + } \ + } while (0) + +// RAII class owning a ncclComm_t, ensuring it doesn't leak. +class NcclComm { + public: + explicit NcclComm(ncclComm_t comm) : comm_(comm) {} + + // Movable, but not copyable. + NcclComm(NcclComm&& c) noexcept : comm_(c.comm_) { c.comm_.reset(); } + NcclComm& operator=(NcclComm&& c) noexcept { + comm_ = c.comm_; + c.comm_.reset(); + return *this; + } + NcclComm(const NcclComm&) = delete; + NcclComm& operator=(const NcclComm&) = delete; + + ~NcclComm() { + if (comm_.has_value() && *comm_ != nullptr) { + VLOG(3) << absl::StreamFormat("Destroying comm %p", *comm_); + XLA_CUDA_WARN_IF_ERROR(ncclCommDestroy(*comm_)); + } + } + + ncclComm_t comm() { return *comm_; } + + private: + absl::optional comm_; +}; + +// Key that identifies a particular Rendezvous object in our global hashtable. +// This determines which calls to ExecuteOnStream communicate with each other. +// The rules are as follows. +// +// * Only ops with the same RunId can communicate with each other. (This is the +// whole purpose of RunId). +// +// * Only ops with the same opcode can communicate with each other. At the +// moment we only support kAllReduce, so we don't check for this explicitly. +// +// * For cross-module all-reduces (i.e. instr->all_reduce_id().has_value()), +// only ops with the same value for all_reduce_id() can communicate with each +// other. +// +// * For cross-replica (i.e. same-module) all-reduces (i.e. +// !all_reduce_id().has_value()), only ops from the same module (as identified +// by its unique_id()) can communicate with each other. +// +struct RendezvousKey { + enum AllReduceKind { + kCrossModule, + kCrossReplica, + }; + + explicit RendezvousKey(const RunId& run_id, + const HloAllReduceInstruction* instr) + : run_id(run_id) { + std::tie(all_reduce_kind, op_id) = + instr->all_reduce_id().has_value() + ? std::make_pair(kCrossModule, instr->all_reduce_id().value()) + : std::make_pair( + kCrossReplica, + static_cast(instr->GetModule()->unique_id())); + } + + template + friend H AbslHashValue(H h, const RendezvousKey& k) { + return H::combine(std::move(h), k.run_id, + static_cast(k.all_reduce_kind), k.op_id); + } + friend bool operator==(const RendezvousKey& a, const RendezvousKey& b) { + return a.run_id == b.run_id && a.all_reduce_kind == b.all_reduce_kind && + a.op_id == b.op_id; + } + friend bool operator!=(const RendezvousKey& a, const RendezvousKey& b) { + return !(a == b); + } + + string ToString() const { + return absl::StrFormat( + "RendezvousKey{run_id=%s, all_reduce_kind=%d, op_id=%d}", + run_id.ToString(), static_cast(all_reduce_kind), op_id); + } + + RunId run_id; + AllReduceKind all_reduce_kind; + int64 op_id; +}; + +// Encapsulates parameters to Rendezvous::SubmitParticipant. +struct ParticipantData { + explicit ParticipantData(RendezvousKey rendezvous_key) + : rendezvous_key(rendezvous_key) {} + + int64 replica_count; // Number of GPUs particiating in the AllReduce. 
int64 element_count; int64 device_ordinal; - int64 generation_counter; + RendezvousKey rendezvous_key; // TODO(b/125951860): We should vet that we're buffer allocating such that // source_buffer == destination_buffer if that avoids a NCCL copy (will depend @@ -60,333 +219,301 @@ struct ParticipantData { se::DeviceMemoryBase destination_data; se::Stream* stream; - NcclAllReduceThunk* originator; - string ToString() const { return absl::StrFormat( "ParticipantData{replica_count=%d, element_count=%d, " - "device_ordinal=%d, generation_counter=%d, stream=%p, originator=%p}", - replica_count, element_count, device_ordinal, generation_counter, - stream, originator); + "rendezvous_key=%s, device_ordinal=%d, stream=%p}", + replica_count, element_count, rendezvous_key.ToString(), device_ordinal, + stream); } }; -// Class that gets instantiated as a singleton in GetGlobalRendezvous() to -// coordinate participating threads in performing an AllReduce operation. -// -// This manager is responsible for establishing communication channels and -// ultimately enqueueing the NCCL library operation onto the participating -// streams. -// -// Implementation note: We make an effort to avoid initializing nccl -// communciation channels too often, as this is expensive. -// -// Ideally, we'd set up a nccl channel between each pair of devices that needs -// to communicate, and close each channel when the GPUs won't be communicating -// again "for a long time" (because channels hold memory on the GPU). As a -// simplification to this ideal, we adopt the following policy. -// -// - We maintain a set of GPUs that are "actively participating" in -// cross-device communications. That set of GPUs is always connected as a -// clique, using ncclCommInitAll. -// -// - When a NcclAllReduceThunk touches a new GPU, we tear down the old clique -// and build a new, bigger one. -// -// - All GPUs ever touched by a thunk are considered "actively in use" by that -// thunk until the thunk is destroyed. Destroying the thunk decrements the -// refcount of the GPUs it's touched, and if that refcount goes to 0 -// (meaning, some GPUs are no longer in use by any thunk), we tear down the -// clique and build a new, smaller one. -// -// This approximation is justified because: -// -// - Currently the only collective operation we support is AllReduce, which -// requires a clique. When we support point-to-point operations, we may not -// want to build a communication clique. -// -// - Tearing down and creating a new thunk is tantamount to running the whole -// XLA:GPU compiler. This is expensive, so shouldn't happen "too often" to -// cause thrashing here. -// -// - XLA executables already keep resources on the GPU tied to the lifetime of -// the executable (e.g. constants stored in GPU memory), so tying the -// lifetime of the nccl communication channels to the lifetime of the -// executable is consistent. -class GlobalRendezvousManager { - public: - // The GpuExecutable-executing threads call this in order to a) establish the - // all-reduce rendezvous and b) enqueue the AllReduce operation on the caller - // thread's associated stream (given in "participant"). - // - // Implementation note: since the rendezvous we're creating here is global, we - // try to be paranoid about the fact that the *correct* one is happening. In - // an ideal world we'd have some StreamExecutor se::Platform level construct - // that we could use for cross-device networking primitives (e.g. 
via a - // NetworkSupport interface) that could be shared between TensorFlow and XLA, - // but this is a reasonable stopgap measure to get multi-GPU-replica up and - // running properly for single-host, single-concurrent-XLA-module usage. - Status SubmitParticipant(ParticipantData participant); - - // Returns the current generation number of AllReduce operations. - // (Currently one AllReduce operation occurs per generation.) - int64 GetCurrentGeneration() { - tensorflow::mutex_lock lock(mutex_); - return current_generation_; +// Key for looking up a particular NCCL clique. This is just a set of unique +// device ordinals (i.e. GPU IDs). +struct NcclCliqueKey { + explicit NcclCliqueKey(absl::Span devices) + : devices(devices.begin(), devices.end()) { + absl::c_sort(this->devices); + CHECK(absl::c_adjacent_find(devices) == devices.end()) + << "Duplicate devices are not allowed: " + << absl::StrJoin(devices, ", "); } - // Increments the refcount of a GPU in our accounting of which devices are - // "actively participating" in cross-device operations. - // - // This doesn't actually do anything other than increment the refcount. If - // the GPU added here is novel, we'll rebuild the nccl communication clique - // when we actually go do the communication. - void AddrefParticipatingDevice(int device_ordinal); + template + friend H AbslHashValue(H h, const NcclCliqueKey& k) { + return H::combine(std::move(h), k.devices); + } + friend bool operator==(const NcclCliqueKey& a, const NcclCliqueKey& b) { + return a.devices == b.devices; + } - // Decrements the refcount of a set of GPUs in our accounting of which devices - // are "actively participating" in cross-device operations. - // - // If one or more GPUs' refcounts to go 0, we immediately destroy the whole - // nccl communication clique. We'll rebuild a new, smaller clique the next - // time it's used. - void DecrefParticipatingDevices(absl::Span device_ordinals); + std::vector devices; +}; - // Gets the set of devices that have a NCCL channel currently open. This is - // primarily for testing. - absl::flat_hash_set DevicesWithOpenNcclChannels() const { - absl::flat_hash_set devices; - tensorflow::mutex_lock lock(mutex_); - for (const auto& kv : comms_) { - devices.insert(kv.first); - } - return devices; +// Owns a clique of NCCL comms which can be used for collective operations among +// a particular set of GPUs. +// +// You must ensure this is not in an error state (i.e. status() is OK) before +// touching any other methods. +// +// (Usually allowing objects to be in a constructed-but-uninitialized state is +// an antipattern. We do it here because it allows us to have a +// RefcountingHashMap which contains and automatically constructs NcclCliques. +// This greatly simplifies the rest of this file.) +// +// Note that if you want to do a collective operation among a subset of these +// GPUs, you'll need a different clique. +class NcclClique { + public: + explicit NcclClique(absl::Span devices) + : devices_(devices.begin(), devices.end()) { + absl::c_sort(devices_); + status_ = Init(); + } + + Status status() { return status_; } + + absl::Span devices() { + TF_CHECK_OK(status_); + return devices_; + } + ncclComm_t comm(int64 device) { + int64 idx = std::distance(devices_.begin(), absl::c_find(devices_, device)); + return comms_.at(idx).comm(); + } + + // These methods let you acquire exclusive access to a NCCL clique, ensuring + // no other NCCL operations are taking place on the clique's comms. 
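  // (Illustrative usage sketch, not part of this patch:
  //
  //    clique->Lock();
  //    ... every participant enqueues its NCCL call on this clique ...
  //    clique->Unlock();
  //
  //  In this file the Lock/Unlock pair is owned by the primary thread of a
  //  Rendezvous; see SubmitParticipant below.)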
+ // + // We disable thread-safety analysis because in common use, only the primary + // thread in a Rendezvous acquires this lock, and that makes thread-safety + // analysis unhappy. Tread carefully, you are playing with fire. + void Lock() NO_THREAD_SAFETY_ANALYSIS { + TF_CHECK_OK(status_); + mu_->lock(); + } + void Unlock() NO_THREAD_SAFETY_ANALYSIS { + TF_CHECK_OK(status_); + mu_->unlock(); } private: - // Destroys the current nccl communication clique and builds a new one - // connecting the given devices. - Status ReinitializeNcclClique(const absl::flat_hash_set& device_ordinals) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); + Status Init() { + VLOG(3) << absl::StreamFormat( + "Initializing nccl comms for participant devices {%s}", + absl::StrJoin(devices_, ", ")); - // Called when all necessary participants are present, the functionality - // that's implemented by all executing threads lives in here. - Status DoAllReduce(ParticipantData data, ncclComm_t comm); + // Restore CUDA device after running this. XLA shouldn't care, but maybe + // another consumer does. + int initial_cuda_device; + XLA_CUDA_RETURN_IF_ERROR(cudaGetDevice(&initial_cuda_device)); + auto cuda_device_restorer = MakeCleanup( + [&] { XLA_CUDA_WARN_IF_ERROR(cudaSetDevice(initial_cuda_device)); }); - // Puts all state back into a "reset" state for the next generation of - // AllReduce requests. - void DeinitializeGeneration() EXCLUSIVE_LOCKS_REQUIRED(mutex_) { - participants_.clear(); - current_generation_++; - initialized_ = false; - done_ = absl::nullopt; + // When using ncclGroupStart/End it seems that the ncclComm_t's are not + // populated until the End() call. This unfortunately makes error handling + // tricky. + std::vector raw_comms(devices_.size(), nullptr); + ncclUniqueId nccl_id; + XLA_CUDA_RETURN_IF_ERROR(ncclGetUniqueId(&nccl_id)); + XLA_CUDA_RETURN_IF_ERROR(ncclGroupStart()); + Status status = [&] { + for (int i = 0; i < devices_.size(); ++i) { + XLA_CUDA_RETURN_IF_ERROR(cudaSetDevice(devices_[i])); + XLA_CUDA_RETURN_IF_ERROR( + ncclCommInitRank(&raw_comms[i], devices_.size(), nccl_id, i)); + } + return Status::OK(); + }(); + // Always call ncclGroupEnd(). + XLA_CUDA_RETURN_IF_ERROR(ncclGroupEnd()); + + // Populate comms_ from the raw comms we created above. If we encountered + // an error above we'll later clear comms_ thus destroying any raw comms + // that were created before the error. + for (int i = 0; i < devices_.size(); ++i) { + VLOG(3) << absl::StreamFormat("Device %d assigned ncclComm %p", + devices_[i], raw_comms[i]); + CHECK(raw_comms[i] != nullptr || !status.ok()); + comms_.emplace_back(raw_comms[i]); + } + if (!status.ok()) { + comms_.clear(); + } + + return status; } - mutable tensorflow::mutex mutex_; - tensorflow::condition_variable all_participants_present_; - tensorflow::condition_variable deinitialized_; + Status status_; + std::vector devices_; + std::vector comms_; - Status initialize_status_ GUARDED_BY(mutex_); - std::vector participants_ GUARDED_BY(mutex_); - int64 current_generation_ GUARDED_BY(mutex_) = 0; - bool initialized_ GUARDED_BY(mutex_) = false; - - struct Comm { - explicit Comm(ncclComm_t nccl_comm) : nccl_comm(nccl_comm) {} - - // Movable, but not copyable. 
- Comm(Comm&& c) : nccl_comm(c.nccl_comm) { c.nccl_comm.reset(); } - Comm& operator=(Comm&& c) { - nccl_comm = c.nccl_comm; - c.nccl_comm.reset(); - return *this; - } - Comm(const Comm&) = delete; - Comm& operator=(const Comm&) = delete; - - absl::optional nccl_comm; - - ~Comm() { - if (nccl_comm.has_value()) { - VLOG(3) << absl::StreamFormat("Destroying comm %p", *nccl_comm); - ncclCommDestroy(*nccl_comm); - } - } - }; - // Communication handles for our NCCL clique. Key is device ordinal. - absl::flat_hash_map comms_ GUARDED_BY(mutex_); - - // Refcounts of which devices are "actively participating" in all-reduces. - // These devices don't necessarily have an open comm, but the next time we run - // an operation, we'll create a NCCL clique between all of them. - absl::flat_hash_map device_refcounts_ GUARDED_BY(mutex_); - - // The participating threads wait for this to count down in order to know we - // can begin the teardown process. - absl::optional done_; + // This mutex is in a unique_ptr so NcclClique can be movable. + std::unique_ptr mu_ = + absl::make_unique(); }; -Status GlobalRendezvousManager::SubmitParticipant(ParticipantData participant) { - auto all_participants_present = [this, &participant]() - EXCLUSIVE_LOCKS_REQUIRED(mutex_) -> bool { - return participants_.size() >= participant.replica_count; - }; +// Global cache of NCCL cliques. An entry in this map is kept alive as long as +// there's a reference to it somewhere. A Thunk holds a reference to each +// Clique it's ever used. +// +// A consequence of the fact that this is process-global is that we'll only ever +// have one clique alive for a given set of GPUs. This means that a process +// will never do two collective operations concurrently on the same set of GPUs. +RefcountingHashMap& GlobalNcclCliqueMap() { + static auto& m = *new RefcountingHashMap( + [](const NcclCliqueKey& key) { + return absl::make_unique(key.devices); + }); + return m; +} - { - tensorflow::mutex_lock lock(mutex_); +// The set of threads that want to do a collective op together all pick the same +// Rendezvous object out of the global cache and call SubmitParticipant. +// +// The Rendezvous instance handles waiting for all threads to join, ensuring +// that a clique exists for the desired set of GPUs, etc. +// +// Rendezvous objects can only be used once. +class Rendezvous { + public: + Rendezvous() = default; - // Spot check for consistent replica counts among submitting threads. - if (!participants_.empty() && - (participants_.back().replica_count != participant.replica_count || - participants_.back().originator != participant.originator)) { - return InvalidArgument( - "Running two XLA modules with AllReduces in parallel is not " - "supported. It is possible this is due to a bug where were try to " - "run two different AllReduces from the same module at once. " - "(Attempted a rendezvous with a different replica count from other " - "participants; existing: %s; submitted: %s)", - participants_.back().ToString(), participant.ToString()); - } - participants_.push_back(participant); + // Runs the all-reduce on the given thread. If successful, returns + // - a handle to the clique that was used, so that the caller may keep the + // clique alive if it chooses. + // - a BlockingCounter initialized to the number of participants, so that + // the caller can coordinate with the participants one last time if it + // chooses. This is useful for coordinating destruction of the Rendezvous. 
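// Editorial sketch (not part of this patch) of how a caller is expected to
// consume SubmitParticipant's result, mirroring what
// NcclAllReduceThunk::ExecuteOnStream does further down in this file:
//
//   std::shared_ptr<Rendezvous> rendezvous =
//       GlobalRendezvousMap()[rendezvous_key];
//   TF_ASSIGN_OR_RETURN(auto clique_and_counter,
//                       rendezvous->SubmitParticipant(participant));
//   cliques_kept_alive.insert(clique_and_counter.first);  // hold the clique
//   rendezvous.reset();                  // drop our reference to the map entry
//   clique_and_counter.second->DecrementCount();
//   clique_and_counter.second->Wait();   // all threads drop it together
//
// cliques_kept_alive is a made-up name standing in for the thunk's AuxData.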
+ StatusOr, + std::shared_ptr>> + SubmitParticipant(ParticipantData participant); - if (all_participants_present()) { - all_participants_present_.notify_all(); - } - } + private: + Status DoAllReduce(ParticipantData participant, ncclComm_t comm); + tensorflow::mutex mu_; + tensorflow::condition_variable all_participants_present_; + + bool initialized_ GUARDED_BY(mu_) = false; + absl::optional done_; + std::vector participants_ GUARDED_BY(mu_); + + // BlockingCounter returned by SubmitParticipant. Initialized by the primary + // thread. + std::shared_ptr returned_blocking_counter_; +}; + +// Global map of Rendezvous objects. A thread participating in a collective op +// looks up its Rendezvous in this map to find the other threads that it's +// participating with. +// +// Rendezvous objects are one-time use, so they're removed from this map once +// we're through with them. +RefcountingHashMap& GlobalRendezvousMap() { + static auto& m = *new RefcountingHashMap(); + return m; +} + +StatusOr, + std::shared_ptr>> +Rendezvous::SubmitParticipant(ParticipantData participant) { // We pull into our thread a) the communication handle and b) whether we're // the "primary" thread for this rendezvous -- the "primary" thread has some // additional responsibilities for setup/teardown. ncclComm_t comm; bool primary; + std::shared_ptr clique; + + // Releases the lock on the clique (held only by the primary thread). + Cleanup> clique_lock_releaser; { - tensorflow::mutex_lock lock(mutex_); - while (!all_participants_present()) { - // Once all the participants have arrived, all participating threads will - // cross this barrier, though only (the first) one will be the "primary". + tensorflow::mutex_lock lock(mu_); + CHECK(!initialized_); + + // Spot check for consistent replica counts among submitting threads. + if (!participants_.empty() && + (participants_.back().replica_count != participant.replica_count || + participants_.back().element_count != participant.element_count || + participants_.back().rendezvous_key != participant.rendezvous_key)) { + return InvalidArgument( + "Mismatch among all-reduce participants. Expected same " + "replica-count, element-count, and rendezvous-key but were %s and %s", + participants_.back().ToString(), participant.ToString()); + } + participants_.push_back(participant); + + // Wait here for all participants to arrive. + while (participants_.size() < participant.replica_count) { all_participants_present_.wait(lock); } + if (participants_.size() == participant.replica_count) { + all_participants_present_.notify_all(); + } - // Somebody will be the first -- that thread has some additional - // responsibilities. + // The first thread to get here has additional responsibilities, such as + // ensuring that there's a NCCL clique available for us to use. primary = !initialized_; - CHECK_EQ(participant.generation_counter, current_generation_); + // Look up or create the NCCL clique for this set of devices. + std::vector devices; + for (const auto& p : participants_) { + devices.push_back(p.device_ordinal); + } + clique = GlobalNcclCliqueMap()[NcclCliqueKey(devices)]; - // Bump the generation counter so the other threads know we've completed the - // global rendezvous and have set up the AllReduce. if (primary) { VLOG(3) << "Primary initializing accounting data."; initialized_ = true; done_.emplace(participant.replica_count); + returned_blocking_counter_ = + std::make_shared( + participant.replica_count); - // Check if all participants_ are in comms_. 
If not, we will rebuild the - // clique to include them. (This can't be spelled using absl::c_any_of - // because it needs to touch comms_ and tensorflow::mutex lacks an - // AssertHeld() function that would let us assert that the lambda is run - // while holding the lock.) - bool new_devices_found = false; - for (const auto& p : participants_) { - if (!comms_.contains(p.device_ordinal)) { - new_devices_found = true; - break; - } - } - - if (new_devices_found) { - absl::flat_hash_set new_clique_device_ordinals; - for (const auto& kv : comms_) { - new_clique_device_ordinals.insert(kv.first); - } - for (const auto& p : participants_) { - new_clique_device_ordinals.insert(p.device_ordinal); - } - - initialize_status_ = ReinitializeNcclClique(new_clique_device_ordinals); - VLOG(3) << "Done initializing communication channels; status: " - << initialize_status_; - if (!initialize_status_.ok()) { - DeinitializeGeneration(); - } - } + // Acquire exclusive access to the NCCL clique itself so that two + // unrelated collective operations won't try to use the clique + // concurrently. + clique->Lock(); + clique_lock_releaser = MakeCleanup([clique] { clique->Unlock(); }); } - if (!initialize_status_.ok()) { - // TODO(b/125951860): If this fails once, it will fail forever. - return initialize_status_; + if (!clique->status().ok()) { + VLOG(1) + << "SubmitParticipant failing because clique failed to initialize: " + << clique->status().ToString(); + return clique->status(); } - comm = *comms_.at(participant.device_ordinal).nccl_comm; + comm = clique->comm(participant.device_ordinal); // Drop the lock at the end of scope so other participants may enter. } VLOG(3) << "Performing all reduce from device ordinal: " << participant.device_ordinal; - Status all_reduce_status = DoAllReduce(participant, comm); - - VLOG(3) << "Waiting for all participants to complete enqueue."; + VLOG(3) << "This thread done with all-reduce op."; done_->DecrementCount(); + // The primary owns the lock on the NCCL clique. Hold it until all threads + // are done. (We'll release it when we return from this function.) if (primary) { - // Primary thread clears out the AllReduce state when everybody is done to - // make it clean-slate for any subsequent AllReduce request (e.g. number of - // replicas may change in the next request). - // - // Note surrounding TODOs for only reinitializing this when the replica - // count / participants actually change -- lots of "playing it safe" - // happening in this first cut. 
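// Descriptive note (comment only, not part of this patch): the release happens
// via clique_lock_releaser, the MakeCleanup object created above, which calls
// clique->Unlock() when this function returns. Waiting on done_ first ensures
// no participant is still enqueueing on the clique's comms at that point, so
// another collective cannot interleave with this one on the same comms.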
+ VLOG(3) + << "Primary waiting for all participants to complete all-reduce op."; done_->Wait(); - VLOG(3) << "All participants completed enqueue."; - VLOG(3) << "Primary thread clearing."; - tensorflow::mutex_lock lock(mutex_); - DeinitializeGeneration(); - VLOG(3) << "Generation is now: " << current_generation_; - deinitialized_.notify_all(); - } else { - VLOG(3) << "Waiting to deinitialize."; - tensorflow::mutex_lock lock(mutex_); - while (initialized_) { - deinitialized_.wait(lock); - } + VLOG(3) << "All participants completed all-reduce op."; } VLOG(3) << "Returning status: " << all_reduce_status; - return all_reduce_status; + if (!all_reduce_status.ok()) { + return all_reduce_status; + } + return std::make_pair(clique, returned_blocking_counter_); } -Status GlobalRendezvousManager::ReinitializeNcclClique( - const absl::flat_hash_set& device_ordinals) { - comms_.clear(); - - std::vector ordinals_vec(device_ordinals.begin(), device_ordinals.end()); - std::vector comm_vec; - comm_vec.resize(device_ordinals.size()); - - VLOG(3) << absl::StreamFormat( - "Initializing nccl comms for participant devices {%s}", - absl::StrJoin(ordinals_vec, ", ")); - ncclResult_t result = ncclCommInitAll(comm_vec.data(), comm_vec.size(), - /*devlist=*/ordinals_vec.data()); - if (result != ncclSuccess) { - return InternalError( - "Failed to initialize NCCL communication channels for %d participants: " - "%s", - ordinals_vec.size(), ncclGetErrorString(result)); - } - - for (int64 i = 0; i < ordinals_vec.size(); ++i) { - VLOG(3) << absl::StreamFormat("Device ordinal %d assigned ncclComm %p", - ordinals_vec[i], comm_vec[i]); - CHECK(comms_.emplace(ordinals_vec[i], Comm{comm_vec[i]}).second); - } - return Status::OK(); -} - -Status GlobalRendezvousManager::DoAllReduce(ParticipantData participant, - ncclComm_t comm) { +Status Rendezvous::DoAllReduce(ParticipantData participant, ncclComm_t comm) { se::StreamExecutor* executor = participant.stream->parent(); se::cuda::ScopedActivateExecutorContext scoped_context(executor); cudaStream_t* cu_stream = reinterpret_cast( @@ -400,14 +527,12 @@ Status GlobalRendezvousManager::DoAllReduce(ParticipantData participant, "datatype=ncclFloat, op=ncclSum, comm=%p, stream=%p)", send_buffer, recv_buffer, participant.element_count, static_cast(comm), cu_stream); - ncclResult_t result = ncclAllReduce(send_buffer, recv_buffer, - /*count=*/participant.element_count, - /*datatype=*/ncclFloat, - /*op=*/ncclSum, - /*comm=*/comm, - /*stream=*/*cu_stream); - TF_RET_CHECK(ncclSuccess == result) - << "Failed to perform all-reduce: " << ncclGetErrorString(result); + XLA_CUDA_RETURN_IF_ERROR(ncclAllReduce(send_buffer, recv_buffer, + /*count=*/participant.element_count, + /*datatype=*/ncclFloat, + /*op=*/ncclSum, + /*comm=*/comm, + /*stream=*/*cu_stream)); VLOG(3) << "Done performing all reduce for ordinal: " << participant.device_ordinal; @@ -415,95 +540,100 @@ Status GlobalRendezvousManager::DoAllReduce(ParticipantData participant, return Status::OK(); } -void GlobalRendezvousManager::AddrefParticipatingDevice(int device_ordinal) { - // Addref'ing a device doesn't do anything other than increment its refcount. - // We'll update our nccl clique if necessary during the next call to - // SubmitParticipant. - tensorflow::mutex_lock lock(mutex_); - device_refcounts_[device_ordinal]++; -} - -void GlobalRendezvousManager::DecrefParticipatingDevices( - absl::Span device_ordinals) { - // Decref'ing devices causes us to destroy the nccl clique if any devices were - // removed due to having refcount 0. 
We'll rebuild the new, smaller clique - // during the next call to SubmitParticipant. - tensorflow::mutex_lock lock(mutex_); - bool removed_device = false; - for (int device_ordinal : device_ordinals) { - auto it = device_refcounts_.find(device_ordinal); - CHECK(it != device_refcounts_.end()); - it->second--; - if (it->second == 0) { - device_refcounts_.erase(it); - removed_device = true; - } - } - - if (removed_device) { - comms_.clear(); - } -} - -static GlobalRendezvousManager* GetGlobalRendezvous() { - static auto* manager = new GlobalRendezvousManager; - return manager; -} - } // namespace +// Extra data stored in NcclAllReduceThunk that we didn't want to expose in the +// header. In particular, this stores the thunk's cache of all NcclCliques it's +// ever used. This causes those cliques to stay alive as long as the thunk +// lives, which is how we avoid expensive reinitialization of NCCL cliques. +struct NcclAllReduceThunk::AuxData { + tensorflow::mutex mu; + absl::flat_hash_set> cliques GUARDED_BY(mu); +}; + /*static*/ absl::flat_hash_set NcclAllReduceThunk::DevicesWithOpenNcclChannels() { - return GetGlobalRendezvous()->DevicesWithOpenNcclChannels(); + absl::flat_hash_set devices; + GlobalNcclCliqueMap().ForEach( + [&](const NcclCliqueKey& k, const std::shared_ptr&) { + devices.insert(k.devices.begin(), k.devices.end()); + }); + return devices; } +NcclAllReduceThunk::NcclAllReduceThunk( + int64 replica_count, int64 element_count, + const BufferAllocation::Slice& source_buffer, + const BufferAllocation::Slice& destination_buffer, + const HloInstruction* all_reduce) + : Thunk(Thunk::kNcclAllReduce, all_reduce), + replica_count_(replica_count), + element_count_(element_count), + source_buffer_(source_buffer), + destination_buffer_(destination_buffer), + aux_data_(absl::make_unique()) {} + Status NcclAllReduceThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { - auto* global_rendezvous = GetGlobalRendezvous(); + const RunId& run_id, HloExecutionProfiler* profiler) { + // Find or create the rendezvous for this collective operation. + RendezvousKey rendezvous_key( + run_id, Cast(hlo_instruction())); + std::shared_ptr rendezvous = + GlobalRendezvousMap()[rendezvous_key]; - ParticipantData participant; + ParticipantData participant(rendezvous_key); participant.replica_count = replica_count_; participant.element_count = element_count_; participant.device_ordinal = stream->parent()->device_ordinal(); - participant.generation_counter = global_rendezvous->GetCurrentGeneration(); participant.source_data = buffer_allocations.GetDeviceAddress(source_buffer_); participant.destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); participant.stream = stream; - participant.originator = this; - // We currently say that that all GPUs this thunk has ever touched are - // "actively participating" in cross-device operations, until the thunk itself - // is destroyed. - // - // This policy is an attempt to avoid thrashing the GPU (ncclCommInitAll is - // very expensive) while also freeing resources on the GPUs when we can. The - // idea is, creating new thunks is tantamount to running the whole XLA:GPU - // compiler stack, so that shouldn't happen terribly often. - bool new_device; + // Do the operation. 
+ StatusOr, + std::shared_ptr>> + result = rendezvous->SubmitParticipant(participant); + if (!result.ok()) { + VLOG(1) << "NcclAllReduceThunk::ExecuteOnStream failed: " + << result.status().ToString(); + return result.status(); + } + + std::shared_ptr clique; + std::shared_ptr blocking_counter; + std::tie(clique, blocking_counter) = std::move(result).ValueOrDie(); + + // Keep the clique we used alive for as long as this Thunk lives. Creating + // new NCCL cliques is expensive, and this is how we avoid thrashing them. { - tensorflow::mutex_lock lock(mu_); - new_device = devices_seen_.insert(participant.device_ordinal).second; - } - if (new_device) { - GetGlobalRendezvous()->AddrefParticipatingDevice( - participant.device_ordinal); + tensorflow::mutex_lock lock(aux_data_->mu); + aux_data_->cliques.insert(std::move(clique)); } - return GetGlobalRendezvous()->SubmitParticipant(std::move(participant)); + // Drop our reference to the Rendezvous and wait for all other threads to do + // the same. If we didn't do this, one of the threads could run past this + // point, reenter ExecuteOnStream for another all-reduce, and attempt to reuse + // the Rendezvous! + // + // An alternative way of accomplishing this goal would be to implement + // RefcountingHashMap::erase() and call it during SubmitParticipant. But + // erase() is deceptively complex to implement correctly. + rendezvous.reset(); + blocking_counter->DecrementCount(); + blocking_counter->Wait(); + + return Status::OK(); } -NcclAllReduceThunk::~NcclAllReduceThunk() { - GetGlobalRendezvous()->DecrefParticipatingDevices( - std::vector(devices_seen_.begin(), devices_seen_.end())); -} +NcclAllReduceThunk::~NcclAllReduceThunk() {} #else Status NcclAllReduceThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& run_id, HloExecutionProfiler* profiler) { return Unimplemented( "NCCL support is not available: this binary was not built with a CUDA " "compiler, which is necessary to build the NCCL source library."); @@ -516,7 +646,7 @@ NcclAllReduceThunk::DevicesWithOpenNcclChannels() { return {}; } -#endif // GOOGLE_CUDA +struct NcclAllReduceThunk::AuxData {}; NcclAllReduceThunk::NcclAllReduceThunk( int64 replica_count, int64 element_count, @@ -529,5 +659,7 @@ NcclAllReduceThunk::NcclAllReduceThunk( source_buffer_(source_buffer), destination_buffer_(destination_buffer) {} +#endif // GOOGLE_CUDA + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h index 9ff4fb187af..52ba4950565 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h @@ -50,6 +50,9 @@ class NcclAllReduceThunk : public Thunk { // TODO(b/125951860): Plumb more datatypes / reduction operators. Initial // implementation is simply F32 summation. + // + // TODO(b/125951860): Support all-reduces with replica groups, i.e. + // all-reduces that compute multiple sums across subsets of all replicas. 
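  // Editorial example (comment only, not part of this patch): under the
  // current restrictions, an HLO all-reduce over f32[1024] with a summation
  // computation and replica count R maps to R concurrent calls to
  // ExecuteOnStream (one per participating GPU), each contributing
  // element_count = 1024 floats to a single ncclSum across all R participants.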
NcclAllReduceThunk(int64 replica_count, int64 element_count, const BufferAllocation::Slice& source_buffer, const BufferAllocation::Slice& destination_buffer, @@ -57,18 +60,21 @@ class NcclAllReduceThunk : public Thunk { ~NcclAllReduceThunk() override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: + // Extra data stored in NcclAllReduceThunk whose types we don't want exposed + // in the header file. (This is mainly because the implementation of + // NcclAllReduceThunk is different depending on whether CUDA is enabled in the + // build, and we don't want to expose *that* mess in the header.) + struct AuxData; + const int64 replica_count_; const int64 element_count_; const BufferAllocation::Slice source_buffer_; const BufferAllocation::Slice destination_buffer_; - - tensorflow::mutex mu_; - // Set of GPUs that ExecuteOnStream has been called on. - absl::flat_hash_set devices_seen_ GUARDED_BY(mu_); + std::unique_ptr aux_data_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index d8249e99d42..93fdc67d8ad 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -108,6 +108,7 @@ limitations under the License. #include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" +#include "tensorflow/stream_executor/cuda/ptxas_utils.h" namespace xla { namespace gpu { @@ -157,7 +158,7 @@ string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) { "uses routines from libdevice.", hlo_module_config); - // GetCudaRotCandidates always inclues ".", but but if everything fails, we + // GetCudaRootCandidates always inclues ".", but but if everything fails, we // return it anyway. Better than returning the empty string. 
return "."; } @@ -265,7 +266,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, HloPassPipeline pipeline("conv_canonicalization"); pipeline.AddInvariantChecker(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - pipeline.AddPass(stream_exec, device_allocator); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); @@ -520,7 +521,6 @@ StatusOr> NVPTXCompiler::RunBackend( BufferSizeBytesFunction(), /*color_alignment=*/ [](LogicalBuffer::Color) { return kXlaAllocatedBufferAlignBytes; }, - /*allow_input_output_aliasing=*/false, /*allocate_buffers_for_constants=*/true)); DumpHloModuleIfEnabled(*module, *buffer_assignment, "after_optimizations"); @@ -677,8 +677,9 @@ std::vector NVPTXCompiler::CompilePtxOrGetCachedResult( if (inserted) { CHECK(!cache_value->compilation_done); if (!ptx.empty()) { - StatusOr> maybe_cubin = CompilePtx( - stream_exec, *cache_ptx, PtxCompilationOptions(hlo_module_config)); + StatusOr> maybe_cubin = se::cuda::CompilePtx( + stream_exec->device_ordinal(), cache_ptx->c_str(), + PtxOptsFromConfig(hlo_module_config)); if (maybe_cubin.ok()) { cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie(); VLOG(2) << "Compiled PTX size:" << ptx.size() diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc index e0f3e84a4cb..527305070b7 100644 --- a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc @@ -30,7 +30,7 @@ OutfeedThunk::OutfeedThunk(ShapeTree outfeed_slices, Status OutfeedThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { VLOG(2) << "Outfeeding from GPU: " << hlo_instruction()->ToString(); auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h index 8ed89f05f0c..5e7bc7cea1a 100644 --- a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h @@ -39,7 +39,7 @@ class OutfeedThunk : public Thunk { OutfeedThunk& operator=(const OutfeedThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc b/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc index 9427a44a90c..64db95ce98a 100644 --- a/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc +++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/stream_executor/cuda/ptxas_utils.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/kernel.h" #include "tensorflow/stream_executor/kernel_spec.h" @@ -272,8 +273,10 @@ StatusOr RedzoneAllocator::CheckRedzones( se::StreamExecutor* executor = stream->parent(); absl::Span compiled_ptx = {}; - StatusOr> compiled_ptx_or = CompilePtxOrGetCached( - executor, redzone_checker_ptx, PtxCompilationOptions(hlo_module_config_)); + StatusOr> compiled_ptx_or = + se::cuda::CompilePtxOrGetCached(executor->device_ordinal(), + redzone_checker_ptx, + PtxOptsFromConfig(hlo_module_config_)); if (compiled_ptx_or.ok()) { compiled_ptx = compiled_ptx_or.ValueOrDie(); } else { diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index 84285be70a4..2f456938d92 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -35,11 +35,11 @@ Status SequentialThunk::Initialize(const GpuExecutable& executable, Status SequentialThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& run_id, HloExecutionProfiler* profiler) { auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); for (const auto& thunk : thunks_) { TF_RETURN_IF_ERROR( - thunk->ExecuteOnStream(buffer_allocations, stream, profiler)); + thunk->ExecuteOnStream(buffer_allocations, stream, run_id, profiler)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index 3c4de1d1a6c..e617c99c2c9 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -42,7 +42,7 @@ class SequentialThunk : public Thunk { Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc index ca409fff67b..75b6f31e3dc 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc @@ -226,186 +226,11 @@ Status ExecuteKernelOnStream(const se::KernelBase& kernel, return Status::OK(); } -// Prints a warning if the ptxas at ptxas_path has known bugs. -// -// Only prints a warning the first time it's called for a particular value of -// ptxas_path. -// -// Locks on entry. -void WarnIfBadPtxasVersion(const string& ptxas_path) { - static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); - static std::unordered_set* seen_ptxas_paths GUARDED_BY(mu) = - new std::unordered_set(); - - tensorflow::mutex_lock lock(mu); - if (!seen_ptxas_paths->insert(ptxas_path).second) { - // Already checked this ptx binary, nothing to do. 
- return; - } - - tensorflow::SubProcess ptxas; - ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"}); - ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE); - if (!ptxas.Start()) { - LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version"; - return; - } - - string out; - int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out, - /*stderr_output=*/nullptr); - if (exit_code != 0) { - LOG(WARNING) << "Running " << ptxas_path << " --version returned " - << exit_code; - return; - } - - int64 vmaj, vmin, vdot; - string vmaj_str, vmin_str, vdot_str; - if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str, - &vmin_str, &vdot_str) || - !absl::SimpleAtoi(vmaj_str, &vmaj) || - !absl::SimpleAtoi(vmin_str, &vmin) || - !absl::SimpleAtoi(vdot_str, &vdot)) { - LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path - << " --version:\n" - << out; - return; - } - - // We need ptxas >= 9.0 as a hard requirement, because we compile targeting - // PTX 6.0. An older ptxas will just fail to compile any of our code. - // - // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some - // address calculations with large offsets (e.g. "load ptr + large_constant"), - // b/70245379. - // - // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way - // that appears related to address calculations, b/111107644. ptxas 9.2.88 - // appears to work, as far as we can tell. - if (vmaj < 9) { - LOG(ERROR) - << "You are using ptxas 8.x, but XLA requires ptxas 9.x (and strongly " - "prefers >= 9.2.88). Compilation of XLA kernels below will likely " - "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas " - "binary is sufficient."; - } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) { - LOG(WARNING) - << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "." - << vdot - << ", which is older than 9.2.88. 
ptxas 9.x before 9.2.88 is known to " - "miscompile XLA code, leading to incorrect results or " - "invalid-address errors.\n\nYou do not need to update to CUDA " - "9.2.88; cherry-picking the ptxas binary is sufficient."; - } -} - -StatusOr> CompilePtxOrGetCached( - se::StreamExecutor* executor, absl::string_view ptx, - PtxCompilationOptions compilation_options) { - using PtxCacheKey = std::tuple; - static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED); - static auto& ptx_cache GUARDED_BY(ptx_cache_mutex) = - *new absl::flat_hash_map>(); - - tensorflow::mutex_lock lock(ptx_cache_mutex); - PtxCacheKey cache_key{executor, std::string(ptx), - compilation_options.ToTuple()}; - auto it = ptx_cache.find(cache_key); - if (it == ptx_cache.end()) { - TF_ASSIGN_OR_RETURN(std::vector compiled, - CompilePtx(executor, ptx, compilation_options)); - it = ptx_cache.emplace(cache_key, std::move(compiled)).first; - } - - CHECK(it != ptx_cache.end()); - const std::vector& compiled = it->second; - return absl::MakeSpan(compiled); -} - -StatusOr> CompilePtx( - se::StreamExecutor* stream_exec, absl::string_view ptx, - PtxCompilationOptions compile_ptx_options) { - int cc_major, cc_minor; - if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major, - &cc_minor)) { - LOG(WARNING) - << "Couldn't get compute capability for device; assuming sm_20."; - cc_major = 2; - cc_minor = 0; - } - - tensorflow::profiler::TraceMe activity( - "Compile PTX", tensorflow::profiler::TraceMeLevel::kInfo); - auto env = tensorflow::Env::Default(); - string ptxas_path; - for (const string& cuda_root : tensorflow::CandidateCudaRoots( - /*preferred_location=*/compile_ptx_options.xla_gpu_cuda_data_dir)) { - ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas"); - VLOG(2) << "Looking for ptxas at " << ptxas_path; - if (env->FileExists(ptxas_path).ok()) { - break; - } - } - TF_RETURN_IF_ERROR(env->FileExists(ptxas_path)); - VLOG(2) << "Using ptxas at " << ptxas_path; - - WarnIfBadPtxasVersion(ptxas_path); - - // Write ptx into a temporary file. - string ptx_path; - if (!env->LocalTempFilename(&ptx_path)) { - return InternalError("couldn't get temp PTX file name"); - } - auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] { - TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path)); - }); - - TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_path, ptx)); - VLOG(2) << "ptx written to: " << ptx_path; - - // Invoke ptxas and collect its output. - string cubin_path; - if (!env->LocalTempFilename(&cubin_path)) { - return InternalError("couldn't get temp CUBIN file name"); - } - auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] { - // CUBIN file may never be created, so the failure to delete it should not - // produce TF error. 
- tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError(); - }); - tensorflow::SubProcess ptxas_info_dumper; - std::vector ptxas_args = { - ptxas_path, ptx_path, "-o", cubin_path, - absl::StrCat("-arch=sm_", cc_major, cc_minor)}; - if (VLOG_IS_ON(2)) { - ptxas_args.push_back("-v"); - } - if (compile_ptx_options.xla_gpu_disable_ptxas_optimizations) { - ptxas_args.push_back("-O0"); - } - ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args); - ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR, - tensorflow::ACTION_PIPE); - if (!ptxas_info_dumper.Start()) { - return InternalError("Failed to launch ptxas"); - } - string stderr_output; - int exit_status = ptxas_info_dumper.Communicate( - /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output); - XLA_LOG_LINES(tensorflow::INFO, stderr_output); - if (exit_status != 0) { - return InternalError("ptxas exited with non-zero error code %d", - exit_status); - } - - // Read in the result of compilation and return it as a byte vector. - string cubin; - TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(), - cubin_path, &cubin)); - std::vector cubin_vector(cubin.begin(), cubin.end()); - return cubin_vector; +se::cuda::PtxCompilationOptions PtxOptsFromConfig( + const HloModuleConfig& hlo_module_config) { + return se::cuda::PtxCompilationOptions( + hlo_module_config.debug_options().xla_gpu_disable_ptxas_optimizations(), + hlo_module_config.debug_options().xla_gpu_cuda_data_dir()); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h index 06ac7dca634..483ab210558 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/cuda/ptxas_utils.h" #include "tensorflow/stream_executor/kernel_spec.h" // Helper functions for interacting with StreamExecutor. @@ -103,47 +104,9 @@ Status ExecuteKernelOnStream(const se::KernelBase& kernel, int64 threads_per_block, int64 block_count, se::Stream* stream); -// Options for compiling with PTX. -struct PtxCompilationOptions { - bool xla_gpu_disable_ptxas_optimizations; - std::string xla_gpu_cuda_data_dir; - - using PtxOptionsTuple = std::tuple; - - explicit PtxCompilationOptions(const HloModuleConfig& hlo_module_config) - : xla_gpu_disable_ptxas_optimizations( - hlo_module_config.debug_options() - .xla_gpu_disable_ptxas_optimizations()), - xla_gpu_cuda_data_dir( - hlo_module_config.debug_options().xla_gpu_cuda_data_dir()) {} - - // For comparison and hashing. - PtxOptionsTuple ToTuple() { - return std::make_tuple(xla_gpu_disable_ptxas_optimizations, - xla_gpu_cuda_data_dir); - } -}; - -// Compiles the given PTX string using ptxas and returns the resulting machine -// code (i.e. a cubin) as a byte array. -// -// Queries stream executor stream_exec to get CUDA compute capability from the -// device. -// -// compile_ptx_options is used to query for the CUDA location in case it is -// customized in a passed flag, and for controlling ptxas optimizations. -// It can be constructed from HloModuleConfig. 
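With the hunks above, the GPU backend no longer locates and invokes ptxas itself; it only converts DebugOptions into se::cuda::PtxCompilationOptions through the new PtxOptsFromConfig helper and delegates compilation and caching (which returns an unowned view of the cubin) to stream_executor. A rough sketch of what a call site looks like after this change; stream_exec, ptx, and config are placeholder names for a se::StreamExecutor*, a PTX string, and an HloModuleConfig already available to the caller, and the uint8 element types are inferred from the neighboring hunks:

// Sketch only, not part of the patch.
StatusOr<absl::Span<const uint8>> compiled_or =
    se::cuda::CompilePtxOrGetCached(stream_exec->device_ordinal(), ptx,
                                    PtxOptsFromConfig(config));
if (!compiled_or.ok()) {
  return compiled_or.status();  // or continue without the cubin, as the
                                // redzone check above chooses to do
}
absl::Span<const uint8> cubin = compiled_or.ValueOrDie();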
-StatusOr> CompilePtx( - se::StreamExecutor* stream_exec, absl::string_view ptx, - PtxCompilationOptions compile_ptx_options); - -// Same as CompilePtx, but caches the result, and returns unowned view of -// the compiled binary. -// -// A copy of the string provided in ptx will be made. -StatusOr> CompilePtxOrGetCached( - se::StreamExecutor* executor, absl::string_view ptx, - PtxCompilationOptions compilation_options); +// Create PtxCompilationOptions out of HloModuleConfig. +se::cuda::PtxCompilationOptions PtxOptsFromConfig( + const HloModuleConfig& hlo_module_config); } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/target_util.cc b/tensorflow/compiler/xla/service/gpu/target_util.cc index 8225cd79a66..746f74b8e45 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.cc +++ b/tensorflow/compiler/xla/service/gpu/target_util.cc @@ -36,14 +36,6 @@ struct TargetIntrinsics { // corresponding to the give TargetIntrinsicID. struct TargetIntrinsics GetIntrinsic(TargetIntrinsicID intrin) { switch (intrin) { - case TargetIntrinsicID::kShflDownF32: { - return {llvm::Intrinsic::nvvm_shfl_sync_down_f32, - llvm::Intrinsic::not_intrinsic}; - } - case TargetIntrinsicID::kShflDownI32: { - return {llvm::Intrinsic::nvvm_shfl_sync_down_i32, - llvm::Intrinsic::not_intrinsic}; - } case TargetIntrinsicID::kThreadIdx: { return {llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, llvm::Intrinsic::amdgcn_workitem_id_x}; diff --git a/tensorflow/compiler/xla/service/gpu/target_util.h b/tensorflow/compiler/xla/service/gpu/target_util.h index b8f796c7259..a7497b91390 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.h +++ b/tensorflow/compiler/xla/service/gpu/target_util.h @@ -31,9 +31,7 @@ namespace gpu { // Enmeration to get target specific intrinsics. enum class TargetIntrinsicID { - kShflDownF32 = 0, - kShflDownI32, - kThreadIdx, + kThreadIdx = 0, kThreadIdy, kThreadIdz, kBlockIdx, diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index b6ce15bb384..4c229046e14 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -4,9 +4,10 @@ # TODO(jlebar): None of these tests actually use the GPU, so they should not # need to run on machines with GPUs present. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = [":friends"]) +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) package_group( name = "friends", diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index bdd06718717..9670a3ece08 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -92,7 +93,7 @@ class Thunk { // // Precondition: Initialize(stream->parent()) has been called. 
virtual Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) = 0; protected: diff --git a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc index 5200a2af412..2635a7b3c45 100644 --- a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc @@ -70,7 +70,7 @@ TriangularSolveThunk::TriangularSolveThunk( Status TriangularSolveThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream, - HloExecutionProfiler* profiler) { + const RunId& /*run_id*/, HloExecutionProfiler* profiler) { VLOG(3) << "uplo=" << se::blas::UpperLowerString(uplo_) << " side=" << se::blas::SideString(side_) << " diagonal=" << se::blas::DiagonalString(unit_diagonal_) diff --git a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h index c947162ea32..94bf6bf6442 100644 --- a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h @@ -49,7 +49,7 @@ class TriangularSolveThunk : public Thunk { TriangularSolveThunk& operator=(const TriangularSolveThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc index 989b542ff45..f7dda240367 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc @@ -23,7 +23,7 @@ namespace xla { namespace gpu { Status TupleThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& /*run_id*/, HloExecutionProfiler* profiler) { auto size = tuple_element_buffers_.size(); auto tuple_element_buffer_addresses = absl::make_unique(size); diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h index dcdbf2cf3c2..47784c5c373 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h @@ -46,7 +46,7 @@ class TupleThunk : public Thunk { TupleThunk& operator=(const TupleThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index c4754fe3789..0223582f2a9 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -48,7 +48,7 @@ Status WhileThunk::Initialize(const GpuExecutable& executable, } Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) { se::DeviceMemoryBase condition_result_data = buffer_allocations.GetDeviceAddress(condition_result_buffer_index_); @@ -59,7 +59,7 @@ Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, profiler->StartHloComputation(); VLOG(3) << "Executing condition computation"; 
TF_RETURN_IF_ERROR(condition_thunk_sequence_->ExecuteOnStream( - buffer_allocations, stream, profiler)); + buffer_allocations, stream, run_id, profiler)); profiler->FinishHloComputation(hlo_instruction()->while_condition()); // Copy the result of condition computation and break the loop if 'false'. @@ -83,8 +83,8 @@ Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, VLOG(3) << "Executing body computation"; // Invoke thunk sequence for while 'body' computation, and pass on // 'profiler' to measure the timing of the thunks in 'body_thunk_sequence_'. - TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(buffer_allocations, - stream, profiler)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream( + buffer_allocations, stream, run_id, profiler)); profiler->FinishHloComputation(hlo_instruction()->while_body()); } return Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 9270f95ee67..97ac24f61cc 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -49,7 +49,7 @@ class WhileThunk : public Thunk { Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - se::Stream* stream, + se::Stream* stream, const RunId& run_id, HloExecutionProfiler* profiler) override; private: diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h index d09ec15e83a..08ef3eabfb6 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h @@ -43,7 +43,7 @@ class HloAliasAnalysis { static StatusOr> Run( const HloModule* module, const HloDataflowAnalysis::FusionCanShareBufferFunction& - fusion_can_share_buffer); + fusion_can_share_buffer = nullptr); string ToString() const; diff --git a/tensorflow/compiler/xla/service/hlo_buffer.h b/tensorflow/compiler/xla/service/hlo_buffer.h index a81078fdc96..91597d6f705 100644 --- a/tensorflow/compiler/xla/service/hlo_buffer.h +++ b/tensorflow/compiler/xla/service/hlo_buffer.h @@ -93,6 +93,17 @@ class HloBuffer { // Return all values contained in this buffer. const std::vector& values() const { return values_; } + // Memory space color. Used to indicate the memory space that the hlo buffer + // needs to live in. + BufferValue::Color color() const { + // Invariant: All values in the buffer should have the same color. + BufferValue::Color result = values()[0]->color(); + for (const HloValue* value : values()) { + DCHECK_EQ(result, value->color()); + } + return result; + } + // Return the unique HLO value in the buffer. CHECK fails if the buffer does // not contain exactly one value. 
const HloValue& GetUniqueValue() const { diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 195c84b034f..908c1ad451a 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -166,14 +166,23 @@ Status HloComputation::RemoveParameter(int64 param_no) { return Status::OK(); } -Status HloComputation::RemoveUnusedParameters() { - CHECK(IsFusionComputation()); +Status HloComputation::RemoveUnusedParametersFromFusedComputation() { + return RemoveUnusedParametersImpl(/*allow_non_fusion=*/false); +} + +Status HloComputation::RemoveUnusedParametersFromAnyComputation() { + return RemoveUnusedParametersImpl(/*allow_non_fusion=*/true); +} + +Status HloComputation::RemoveUnusedParametersImpl(bool allow_non_fusion) { + CHECK(allow_non_fusion || IsFusionComputation()); int64 removed = 0; for (int64 i = 0; i < param_instructions_.size(); ++i) { HloInstruction* param_instruction = param_instructions_[i]; if (param_instruction->user_count() == 0 && param_instruction != root_instruction()) { - TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction)); + TF_RETURN_IF_ERROR( + RemoveInstructionImpl(param_instruction, allow_non_fusion)); ++removed; continue; } @@ -185,14 +194,15 @@ Status HloComputation::RemoveUnusedParameters() { StrCat("param_", param_no))); TF_RETURN_IF_ERROR(param_instruction->ReplaceAllUsesWith(new_instr)); param_instructions_[param_no] = new_instr; - TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction)); + TF_RETURN_IF_ERROR( + RemoveInstructionImpl(param_instruction, allow_non_fusion)); } } param_instructions_.resize(param_instructions_.size() - removed); return Status::OK(); } -bool HloComputation::IsRemovable(const HloInstruction* instruction) { +bool HloComputation::IsSafelyRemovable(const HloInstruction* instruction) { // If the instruction has control predecessors or successors then we cannot // remove the instruction without violating ordering constraints (added, for // example, to avert interference due to buffer aliasing). 
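The renamed and newly added removal entry points in this hunk are easiest to see together. The sketch below is not part of the patch; computation and dead_instr are hypothetical, and it assumes the caller has already re-established whatever invariant (control dependencies, paired Send/Recv, and so on) made the instruction unsafe to remove in the first place.

// Illustrative use of the new HloComputation removal API inside a pass that
// has rewritten a non-fusion computation.
Status CleanupAfterRewrite(HloComputation* computation,
                           HloInstruction* dead_instr) {
  // Unlike RemoveUnusedParametersFromFusedComputation, this variant is also
  // allowed on non-fusion computations.
  TF_RETURN_IF_ERROR(computation->RemoveUnusedParametersFromAnyComputation());
  if (dead_instr->user_count() == 0 &&
      dead_instr != computation->root_instruction()) {
    if (computation->IsSafelyRemovable(dead_instr)) {
      TF_RETURN_IF_ERROR(computation->RemoveInstruction(dead_instr));
    } else {
      // ForceRemoveInstruction skips the IsSafelyRemovable() check; the
      // caller is responsible for keeping the module invariants intact.
      TF_RETURN_IF_ERROR(computation->ForceRemoveInstruction(dead_instr));
    }
  }
  return Status::OK();
}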
@@ -223,7 +233,7 @@ Status HloComputation::RemoveInstructionAndUnusedOperands( TF_RET_CHECK(root_instruction() != instruction); TF_RET_CHECK(instruction->user_count() == 0); - TF_RET_CHECK(IsRemovable(instruction)) + TF_RET_CHECK(IsSafelyRemovable(instruction)) << "Cannot remove instruction: " << instruction->ToString(); absl::flat_hash_set removed; std::queue worklist; @@ -233,7 +243,7 @@ Status HloComputation::RemoveInstructionAndUnusedOperands( worklist.pop(); if (removed.contains(item) || item->user_count() != 0 || - item == root_instruction() || !IsRemovable(item) || + item == root_instruction() || !IsSafelyRemovable(item) || (item->HasSideEffect() && item != instruction)) { continue; } @@ -248,9 +258,18 @@ Status HloComputation::RemoveInstructionAndUnusedOperands( } Status HloComputation::RemoveInstruction(HloInstruction* instruction) { + return RemoveInstructionImpl(instruction, /*ignore_safety_check=*/false); +} + +Status HloComputation::ForceRemoveInstruction(HloInstruction* instruction) { + return RemoveInstructionImpl(instruction, /*ignore_safety_check=*/true); +} + +Status HloComputation::RemoveInstructionImpl(HloInstruction* instruction, + bool ignore_safety_check) { VLOG(2) << "Removing instruction " << instruction->name() << " from computation " << name(); - TF_RET_CHECK(IsRemovable(instruction)) + TF_RET_CHECK(ignore_safety_check || IsSafelyRemovable(instruction)) << "cannot remove instruction: " << instruction->ToString(); TF_RET_CHECK(root_instruction() != instruction) << "cannot remove root instruction " << instruction->name(); @@ -291,6 +310,16 @@ void HloComputation::set_root_instruction(HloInstruction* new_root_instruction, } DCHECK(root_found); + if (parent() && parent()->has_entry_computation() && + parent()->entry_computation() == this) { + if (!Shape::Equal()(new_root_instruction->shape(), + root_instruction_->shape())) { + // Rebuild input output alias config now that we have a new output shape. + parent()->input_output_alias_config() = + HloInputOutputAliasConfig(new_root_instruction->shape()); + } + } + root_instruction_ = new_root_instruction; } diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 89dbe93b36b..ad6cc2fee41 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -115,7 +115,12 @@ class HloComputation { // Remove unused parameters from the computation. // Note this is only applicatable to the computation for the fusion // instruction. - Status RemoveUnusedParameters(); + Status RemoveUnusedParametersFromFusedComputation(); + + // Remove unused parameters from the computation. Unlike + // RemoveUnusedParametersFromFusedComputation, this function can be used + // to remove parameters from non-fusion computations. + Status RemoveUnusedParametersFromAnyComputation(); // Adds a new parameter instruction to a fusion computation. // @@ -135,6 +140,11 @@ class HloComputation { // users. Instruction is deallocated with this call. Status RemoveInstruction(HloInstruction* instruction); + // Removes an instruction from the computation. The instruction must have no + // users. Instruction is deallocated with this call. The instruction will be + // removed even if it is marked as not removable. 
+ Status ForceRemoveInstruction(HloInstruction* instruction); + // Remove an instruction (including side effecting ones) from the computation // and also transitively any operand that has no side effect and no users post // removing an instruction. The instruction must have no users. Instruction is @@ -378,13 +388,13 @@ class HloComputation { // the HLO computation with the exception of fusion computation. A parameter // instruction is removable for a fusion computation. // - // Note that IsRemovable() is a necessariy condition to remove an instruction - // rather than a sufficient condition. For example, instructions with - // side-effect (e.g., Send, Infeed) may be removed from a computation, but the - // transformation must guarantee the invariants relevant to the instructions - // still hold (e.g., Send and Recv must be removed together to make each - // channel complete). - bool IsRemovable(const HloInstruction* instruction); + // Note that IsSafelyRemovable() is a necessary condition to remove an + // instruction rather than a sufficient condition. For example, instructions + // with side-effect (e.g., Send, Infeed) may be removed from a computation, + // but the transformation must guarantee the invariants relevant to the + // instructions still hold (e.g., Send and Recv must be removed together to + // make each channel complete). + bool IsSafelyRemovable(const HloInstruction* instruction); // Returns a map from channel-id to the group of instructions associated with // the channel. These instructions will be considered as a single node for @@ -459,6 +469,11 @@ class HloComputation { std::vector* post_order, HloInstruction* root, absl::flat_hash_map* visited) const; + Status RemoveUnusedParametersImpl(bool allow_non_fusion); + + Status RemoveInstructionImpl(HloInstruction* instruction, + bool ignore_safety_check); + string name_; int64 unique_id_; HloInstruction* root_instruction_; diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index 9036ae8d5fd..a1586af7b5a 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -149,7 +149,7 @@ string HloDataflowAnalysis::ToString() const { StrAppend(&out, " Instruction value sets:\n"); for (const HloComputation* computation : module_.computations()) { for (const HloInstruction* instruction : computation->instructions()) { - StrAppend(&out, " ", instruction->name(), ":\n"); + StrAppend(&out, "Instruction: \n ", instruction->name(), ":\n"); if (instruction->shape().IsTuple()) { GetInstructionValueSet(instruction) .ForEachElement([this, &instruction, &out]( @@ -1044,7 +1044,7 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser( } if (fusion_can_share_buffer_ != nullptr) { - return fusion_can_share_buffer_(user, operand); + return fusion_can_share_buffer_(user, operand, user_index); } if (user->IsLoopFusion() || user->IsInputFusion()) { diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index ece17fc4c3e..de4ea8a80df 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -49,12 +49,14 @@ class HloDataflowAnalysis { // default strategy. // // The first parameter of the function should be the fusion instruction, the - // second parameter should be an operand of the fusion instruction.
+ // second parameter should be an operand of the fusion instruction. The third + // parameter should be the output index of the fusion. // // TODO(b/80315712): Find a better way to tell whether a fusion can share // buffer. using FusionCanShareBufferFunction = std::function; + const HloInstruction* fusion, const HloInstruction* operand, + const ShapeIndex& fusion_index)>; // Run dataflow analysis on the given module. Parameters: // @@ -128,7 +130,7 @@ class HloDataflowAnalysis { int64 value_count() const { return values_.size(); } // Return a vector of all HloValues stabily sorted by HloValue::Id. - const std::vector& values() const { return values_vector_; } + const std::vector& values() const { return values_vector_; } // Return the call graph used for computing the dataflow. const CallGraph& call_graph() const { return *call_graph_; } @@ -153,6 +155,8 @@ class HloDataflowAnalysis { HloInstruction* user, const ShapeIndex& user_index) const; + const HloModule& module() const { return module_; } + protected: HloDataflowAnalysis( const HloModule& module, bool ssa_form, @@ -238,7 +242,7 @@ class HloDataflowAnalysis { std::vector value_ids_to_delete_; // A vector containing all HloValues sorted by HloValue::Id. - std::vector values_vector_; + std::vector values_vector_; // The Id to use for the next HloValue. HloValue::Id next_value_id_ = 0; diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index cb2341a80be..275feab5030 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -2576,7 +2576,8 @@ TEST_F(CanShareOperandBufferWithUserTest, FusionCanShareBufferCustomized) { auto fusion = computation_->CreateFusionInstruction( {add, two, mul}, HloInstruction::FusionKind::kInput); RunAnalysis(/*fusion_can_share_buffer=*/[](const HloInstruction* fusion, - const HloInstruction*) { + const HloInstruction*, + const ShapeIndex& output_index) { return fusion->IsLoopFusion(); }); diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc index a5a11f09cf4..702de4fef86 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.cc +++ b/tensorflow/compiler/xla/service/hlo_dce.cc @@ -49,7 +49,7 @@ StatusOr HloDCE::Run(HloModule* module) { for (auto* instruction : computation->instructions()) { if (instruction != computation->root_instruction() && instruction->user_count() == 0 && - computation->IsRemovable(instruction) && + computation->IsSafelyRemovable(instruction) && !instruction->HasSideEffect()) { dead_roots.push_back(instruction); } diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 0320979102f..21cc216b33b 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include #include #include #include @@ -782,26 +783,15 @@ Status HloEvaluator::HandleTuple(HloInstruction* tuple) { namespace { -// Straightforward implementation of 1D DFT transform. Uses passed-in start -// index and stride to gather inputs from the data vector into the preallocated -// buffer, computes the result, and writes it back to the same locations in the -// data vector. Runs in O(length^2) time. -// -// Parameters contract_output and expand_input are used to avoid unnecessary -// calculations. 
When contract_output is set to true, then only (length / 2) + 1 -// output values are computed. When expand_input is set to true, then -// (length / 2) + 1 values from the data set are used to re-create the full set -// of size 'length', on which the transform is then performed. -// -void NaiveDft1D(int64 length, int64 start, int64 stride, bool inverse, - bool contract_output, bool expand_input, - absl::Span data, absl::Span buffer) { - CHECK_GT(data.size(), start + (length - 1) * stride); - CHECK_GT(buffer.size(), length - 1); - - // Copy input data to 1D vector. +// Common code used by 1D implementations, which copies data from the input to +// the contiguous buffer. Returns true if all copied values are zero. +bool GatherToBuffer(absl::Span data, int64 length, int64 start, + int64 stride, bool expand_input, + absl::Span buffer) { + CHECK_GE(buffer.size(), length); bool input_is_zero = true; const int64 ub = expand_input ? length / 2 + 1 : length; + CHECK_GE(data.size(), start + (ub - 1) * stride); for (int64 k = 0; k < ub; k++) { complex128 value = data[start + k * stride]; input_is_zero &= value == complex128(0.0, 0.0); @@ -815,22 +805,118 @@ void NaiveDft1D(int64 length, int64 start, int64 stride, bool inverse, } } } + return input_is_zero; +} + +// Returns (conjugated, if 'inverse' is true) k-th twiddle for the given length. +inline complex128 Twiddle(int64 k, int64 length, bool inverse) { + auto coeff = std::exp(complex128(0.0, -2.0 * M_PI * k / length)); + return inverse ? std::conj(coeff) : coeff; +} + +// Straightforward implementation of 1D DFT transform of arbitrary length. Uses +// passed-in start index and stride to gather inputs from the data vector into +// the preallocated buffer, computes the result, and writes it back to the same +// locations in the data vector. Runs in O(length^2) time. +// +// Parameters contract_output and expand_input are used to avoid unnecessary +// calculations. When contract_output is set to true, then only (length / 2) + 1 +// output values are computed. When expand_input is set to true, then +// (length / 2) + 1 values from the data set are used to re-create the full set +// of size 'length', on which the transform is then performed. +// +void NaiveDft1D(int64 length, int64 start, int64 stride, bool inverse, + bool contract_output, bool expand_input, + absl::Span data, absl::Span buffer) { + const bool input_is_zero = + GatherToBuffer(data, length, start, stride, expand_input, buffer); - // Do 1D transformation with double precision. if (!input_is_zero) { const int64 ub = contract_output ? length / 2 + 1 : length; for (int64 k = 0; k < ub; k++) { complex128 value = complex128(0.0, 0.0); for (int n = 0; n < length; n++) { - auto coeff = std::exp(complex128(0.0, -2.0 * M_PI * n * k / length)); - value += (inverse ? std::conj(buffer[n]) : buffer[n]) * coeff; + value += buffer[n] * Twiddle(n * k, length, inverse); } data[start + k * stride] = - inverse ? std::conj(value) / complex128(length, 0.0) : value; + inverse ? value / complex128(length, 0.0) : value; } } } +// Non-recursive implementation of the Cooley-Tukey radix-2 decimation in time. +// Performs 1D FFT transform for the lengths, which are powers of 2. Runs in +// O(length * log(length)) time. Uses the same parameters as the naive +// implementation above, except that the preallocated buffer must be at least +// twice as big as the length of the transform, because the buffer is used to +// hold both input and output values for each stage of the transform. 
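The comment above describes the power-of-two path only in prose. As a standalone illustration of the same radix-2 decimation-in-time butterflies (even + twiddle * odd and even - twiddle * odd per stage, with the twiddle conjugated for the inverse transform), here is a compact version operating on a std::vector of std::complex<double>. It is a sketch of the algorithm, not of the evaluator's code: it uses an explicit bit-reversal permutation, whereas Fft1D below ping-pongs between the two halves of its scratch buffer instead.

#include <cmath>
#include <complex>
#include <cstddef>
#include <utility>
#include <vector>

// In-place iterative radix-2 DIT FFT; a.size() must be a power of two.
void Radix2Fft(std::vector<std::complex<double>>& a, bool inverse) {
  const size_t n = a.size();
  // Reorder the input into bit-reversed index order.
  for (size_t i = 1, j = 0; i < n; ++i) {
    size_t bit = n >> 1;
    for (; j & bit; bit >>= 1) j ^= bit;
    j ^= bit;
    if (i < j) std::swap(a[i], a[j]);
  }
  // log2(n) stages of butterflies.
  for (size_t len = 2; len <= n; len <<= 1) {
    const double ang = (inverse ? 2.0 : -2.0) * M_PI / static_cast<double>(len);
    const std::complex<double> wlen(std::cos(ang), std::sin(ang));
    for (size_t i = 0; i < n; i += len) {
      std::complex<double> w(1.0, 0.0);
      for (size_t k = 0; k < len / 2; ++k) {
        const std::complex<double> even = a[i + k];
        const std::complex<double> odd = w * a[i + k + len / 2];
        a[i + k] = even + odd;            // even + twiddle * odd
        a[i + k + len / 2] = even - odd;  // even - twiddle * odd
        w *= wlen;
      }
    }
  }
  if (inverse) {
    for (std::complex<double>& x : a) x /= static_cast<double>(n);
  }
}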
+// +void Fft1D(int64 length, int64 start, int64 stride, bool inverse, + bool contract_output, bool expand_input, absl::Span data, + absl::Span buffer) { + CHECK(IsPowerOfTwo(static_cast(length))); + const bool input_is_zero = + GatherToBuffer(data, length, start, stride, expand_input, buffer); + + if (!input_is_zero) { + auto generate_twiddles = [](int64 length, bool inverse) { + std::vector twiddles; + // Need only half the twiddles. + for (int64 k = 0; k < length / 2; k++) { + twiddles.push_back(Twiddle(k, length, inverse)); + } + return twiddles; + }; + + // Indices into the parts of the buffer used for input and output values. + int64 in_base = length; + int64 out_base = 0; + + // At each stage, we "split" the input data into num_blocks, with block_size + // values in each block. + for (int64 num_blocks = 1; num_blocks < length; num_blocks *= 2) { + // Swap input and output parts of the buffer. + std::swap(in_base, out_base); + auto twiddles = generate_twiddles(num_blocks * 2, inverse); + const int64 block_size = length / num_blocks; + const int64 next_iteration_block_size = block_size / 2; + for (int64 block = 0; block < num_blocks; block++) { + const int64 in_offset = in_base + block * block_size; + const int64 out_offset = out_base + block * next_iteration_block_size; + // For each (even, odd) pair of values in the block, calculate two + // output values as even + twiddle * odd and even - twiddle * odd. + for (int64 pair = 0; pair < block_size / 2; pair++) { + const complex128 even = buffer[in_offset + pair]; + const complex128 odd = buffer[in_offset + block_size / 2 + pair]; + const complex128 twiddled_odd = twiddles[block] * odd; + buffer[out_offset + pair] = even + twiddled_odd; + buffer[out_offset + length / 2 + pair] = even - twiddled_odd; + } + } + } + // Copy computed result back to data. + const int64 ub = contract_output ? length / 2 + 1 : length; + for (int64 k = 0; k < ub; k++) { + complex128 value = buffer[out_base + k]; + data[start + k * stride] = + inverse ? value / complex128(length, 0.0) : value; + } + } +} + +// Determine, which implementation of 1D transform to use and call it. +void Dft1D(int64 length, int64 start, int64 stride, bool inverse, + bool contract_output, bool expand_input, absl::Span data, + absl::Span buffer) { + if (IsPowerOfTwo(static_cast(length))) { + Fft1D(length, start, stride, inverse, contract_output, expand_input, data, + buffer); + } else { + NaiveDft1D(length, start, stride, inverse, contract_output, expand_input, + data, buffer); + } +} + // Helper to reverse the order of dimension lengths in the passed-in literal. std::vector GetDimensionLengths(const Literal& literal) { std::vector lengths = literal.shape().dimensions(); @@ -906,8 +992,8 @@ void Sweep(int64 fft_rank, FftType fft_type, const int64 stride = fft_strides[sweep_axis]; const bool expand_input = input_is_truncated && sweep_axis == 0; const bool contract_oputput = output_is_truncated && sweep_axis == 0; - NaiveDft1D(length, start, stride, inverse, contract_oputput, expand_input, - data, buffer); + Dft1D(length, start, stride, inverse, contract_oputput, expand_input, + data, buffer); } else if (axis == sweep_axis) { // Visit only the elements with coordinate 0 along the sweep axis. sweep(sweep_axis, axis - 1, start); @@ -1207,10 +1293,10 @@ Status CheckParameters(const Shape& input_shape, const Shape& output_shape, } // namespace -// Flexible but slow implementation of the discrete Fourier transform. 
All -// transform types (FFT, IFFT, RFFT, and IRFFT) are supported, as well as the -// arbitrary rank and length of each dimension of the transform, and arbitrary -// layouts of the input and output literals. +// Flexible implementation of the discrete Fourier transform. All transform +// types (FFT, IFFT, RFFT, and IRFFT) are supported, as well as the arbitrary +// rank and length of each dimension of the transform, and arbitrary layouts of +// the input and output literals. // // The input literal in operand 0 provides input data, which must be complex64 // for FFT, IFFT, IRFFT transforms and float for RFFT. The transform is computed @@ -1241,15 +1327,18 @@ Status CheckParameters(const Shape& input_shape, const Shape& output_shape, // complex64[64][16][9] input array will use all input values and will produce // float[64][16][16] output. // -// The implementation of the 1D transform is a straightforward loop nest. The -// transforms of higher ranks apply sets of 1D transforms along each axis. For -// example, the 2D transform is computed by applying 1D transforms to each -// column followed by applying 1D transforms to each row. +// The implementation of the 1D transform for lengths that are powers of 2 is +// the Cooley-Tukey radix-2 decimation-in-time. For all other 1D transform +// lengths, a straightforward, but slow, loop nest is used. The transforms of +// higher ranks apply sets of 1D transforms along each axis. For example, the 2D +// transform is computed by applying 1D transforms to each column followed by +// applying 1D transforms to each row. // // In general, a transform of rank n runs in O(N0*N1*...*Nn*(N0+N1+...+Nn)) -// time, where Ni is the length of the transform's i-th dimension. It is -// possible to reduce the run time to O(N0*N1*...(log(N0)+log(N1)+...)) by -// plugging in a more efficient 1D implementation. +// time, where Ni is the length of the transform's i-th dimension. However, for +// dimension lengths that are powers of 2, the run time along these dimensions +// is reduced to log(Ni) in the summation, giving the runtime of +// O(N0*N1*...*Nn*(log(N0)+log(N1)+...+log(Nn))) in the best case. // Status HloEvaluator::HandleFft(HloInstruction* fft) { const FftType fft_type = fft->fft_type(); @@ -1275,8 +1364,14 @@ Status HloEvaluator::HandleFft(HloInstruction* fft) { // Linearized working data set. std::vector data(fft_size); - // Temporary buffer allocated once and used in 1D sweeps. - std::vector buffer(*absl::c_max_element(fft_lengths)); + // Temporary buffer allocated once and used in 1D sweeps. For dimension + // length values that are powers of 2, the buffer should be twice as large. + int64 buffer_size = 0; + for (auto len : fft_lengths) { + int64 size = IsPowerOfTwo(static_cast(len)) ? len * 2 : len; + buffer_size = std::max(buffer_size, size); + } + std::vector buffer(buffer_size); // Sizes of each axis of input and output literals.
const auto input_lengths = GetDimensionLengths(input_literal); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index c4266f95fcc..888434774bb 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -129,6 +129,62 @@ class HloEvaluatorTest : public HloTestBase { EXPECT_TRUE(LiteralTestUtil::Equal(expected, result)); } + std::unique_ptr MaxComputationScalarF32() { + HloComputation::Builder max_computation("max"); + Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); + auto param_lhs = max_computation.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "lhs")); + auto param_rhs = max_computation.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "rhs")); + max_computation.AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs)); + return max_computation.Build(); + } + + void ReduceWindowMaxIotaTest(int window_size, int padding, int stride, + int window_dilation, int base_dilation, + const Literal& expected) { + HloComputation::Builder b(TestName()); + + // arg: + // f32[4,4] { + // { 0, 1, 2, 3 }, + // { 4, 5, 6, 7 }, + // { 8, 9, 10, 11 }, + // { 12, 13, 14, 15 } + // } + auto arg_array = absl::make_unique>(4, 4); + arg_array->FillIota(0); + auto arg_literal = LiteralUtil::CreateR2FromArray2D(*arg_array); + + HloInstruction* arg_instruction = b.AddInstruction( + HloInstruction::CreateConstant(std::move(arg_literal))); + auto init_value = b.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.f))); + auto max_func = m_->AddEmbeddedComputation(MaxComputationScalarF32()); + + Window window; + WindowDimension dim; + dim.set_size(window_size); + dim.set_stride(stride); + dim.set_padding_low(padding); + dim.set_padding_high(padding); + dim.set_window_dilation(window_dilation); + dim.set_base_dilation(base_dilation); + *window.add_dimensions() = dim; + *window.add_dimensions() = dim; + + int dim0 = expected.shape().dimensions(0); + int dim1 = expected.shape().dimensions(1); + Shape shape = ShapeUtil::MakeShape(F32, {dim0, dim1}); + b.AddInstruction(HloInstruction::CreateReduceWindow( + shape, arg_instruction, init_value, window, max_func)); + + m_->AddEntryComputation(b.Build()); + TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate()); + EXPECT_TRUE(LiteralTestUtil::Equal(expected, result)); + } + protected: explicit HloEvaluatorTest(bool use_bfloat16) : use_bfloat16_(use_bfloat16) { InitializeFftData(); @@ -2585,16 +2641,7 @@ TEST_P(HloEvaluatorBf16Test, ReduceWindowMax) { auto init_value = b.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.f))); - - HloComputation::Builder max_computation("max"); - Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); - auto param_lhs = max_computation.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape, "lhs")); - auto param_rhs = max_computation.AddInstruction( - HloInstruction::CreateParameter(1, scalar_shape, "rhs")); - max_computation.AddInstruction(HloInstruction::CreateBinary( - scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs)); - auto max_func = m_->AddEmbeddedComputation(max_computation.Build()); + auto max_func = m_->AddEmbeddedComputation(MaxComputationScalarF32()); Window window; WindowDimension dim; @@ -2619,56 +2666,79 @@ TEST_P(HloEvaluatorBf16Test, ReduceWindowMax) { EXPECT_TRUE(LiteralTestUtil::Equal(expected, result)); } -TEST_P(HloEvaluatorBf16Test, 
ReduceWindowMaxWindowDilation) { - HloComputation::Builder b(TestName()); +TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxIotaWindowDilation) { + auto expected = LiteralUtil::CreateR2({{10, 11}, {14, 15}}); + ReduceWindowMaxIotaTest( + /*window_size=*/2, + /*padding=*/0, + /*stride=*/1, + /*window_dilation=*/2, + /*base_dilation=*/1, + /*expected=*/expected); +} - // arg: - // f32[3,3] { - // { 1, 2, 3 }, - // { 5, 6, 7 }, - // { 9, 10, 11 }, - // } - auto arg_array = absl::make_unique>(3, 3); - arg_array->FillUnique(1.0f); - auto arg_literal = LiteralUtil::CreateR2FromArray2D(*arg_array); +TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxIotaStrideWindowDilation) { + auto expected = LiteralUtil::CreateR2({{10}}); + ReduceWindowMaxIotaTest( + /*window_size=*/2, + /*padding=*/0, + /*stride=*/2, + /*window_dilation=*/2, + /*base_dilation=*/1, + /*expected=*/expected); +} - HloInstruction* arg_instruction = - b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal))); +TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxIotaBaseDilation) { + auto expected = LiteralUtil::CreateR2({{0, 1, 1, 2, 2, 3}, + {4, 5, 5, 6, 6, 7}, + {4, 5, 5, 6, 6, 7}, + {8, 9, 9, 10, 10, 11}, + {8, 9, 9, 10, 10, 11}, + {12, 13, 13, 14, 14, 15}}); + ReduceWindowMaxIotaTest( + /*window_size=*/2, + /*padding=*/0, + /*stride=*/1, + /*window_dilation=*/1, + /*base_dilation=*/2, + /*expected=*/expected); +} - auto init_value = b.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.f))); +TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxIotaStrideBaseDilation) { + auto expected = + LiteralUtil::CreateR2({{0, 1, 2}, {4, 5, 6}, {8, 9, 10}}); + ReduceWindowMaxIotaTest( + /*window_size=*/2, + /*padding=*/0, + /*stride=*/2, + /*window_dilation=*/1, + /*base_dilation=*/2, + /*expected=*/expected); +} - HloComputation::Builder max_computation("max"); - Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); - auto param_lhs = max_computation.AddInstruction( - HloInstruction::CreateParameter(0, scalar_shape, "lhs")); - auto param_rhs = max_computation.AddInstruction( - HloInstruction::CreateParameter(1, scalar_shape, "rhs")); - max_computation.AddInstruction(HloInstruction::CreateBinary( - scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs)); - auto max_func = m_->AddEmbeddedComputation(max_computation.Build()); +TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxIotaStrideBothDilation) { + auto expected = + LiteralUtil::CreateR2({{5, 6, 7}, {9, 10, 11}, {13, 14, 15}}); + ReduceWindowMaxIotaTest( + /*window_size=*/2, + /*padding=*/0, + /*stride=*/2, + /*window_dilation=*/2, + /*base_dilation=*/2, + /*expected=*/expected); +} - Window window; - WindowDimension dim; - dim.set_size(2); - dim.set_stride(1); - dim.set_padding_low(0); - dim.set_padding_high(0); - dim.set_window_dilation(2); - dim.set_base_dilation(1); - *window.add_dimensions() = dim; - *window.add_dimensions() = dim; - - Shape shape = ShapeUtil::MakeShape(F32, {1, 1}); - b.AddInstruction(HloInstruction::CreateReduceWindow( - shape, arg_instruction, init_value, window, max_func)); - - m_->AddEntryComputation(b.Build()); - - TF_ASSERT_OK_AND_ASSIGN(Literal result, Evaluate()); - - auto expected = LiteralUtil::CreateR2({{11}}); - EXPECT_TRUE(LiteralTestUtil::Equal(expected, result)); +TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxIotaPaddingStrideBaseDilation) { + // The base is dilated first, and then padding is applied, hence this result. 
+ auto expected = + LiteralUtil::CreateR2({{0, 2, 3}, {8, 10, 11}, {12, 14, 15}}); + ReduceWindowMaxIotaTest( + /*window_size=*/3, + /*padding=*/1, + /*stride=*/3, + /*window_dilation=*/1, + /*base_dilation=*/2, + /*expected=*/expected); } TEST_P(HloEvaluatorBf16Test, ReduceWindowAdd) { diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index c3b5838cf0a..a6a84d226f5 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -2673,16 +2673,27 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { std::vector base_index(rank); bool out_of_bound = false; for (int64 i = 0; i < rank; ++i) { + // Padding is applied to the dilated base. Say that padding is 3 and + // dilation is 2 for some dimension. After applying base dilation and + // padding, the dimension looks like: + // P P P E D D E D D ... E D D E P P P + // where E are the elements and D are the holes. So, the elements are + // located in indices: padding + k*base_dilation for k = {0, 1, 2, ...}. + // We are accessing elements in the transformed base at indices: + // window_count_index * stride + window_index * window_dilation. + // Solving for k gives us + // (win_count_i * stride + win_i * win_dilation - pad) / base_dilation + // When this is a natural number, we index an original element. + // Otherwise, we index a 0 (pad or hole), and we don't need to apply + // the callback f. base_index[i] = window_count_index[i] * window.dimensions(i).stride() + window_index[i] * window.dimensions(i).window_dilation() - window.dimensions(i).padding_low(); - // We are not in the base area if the dilation placed us out of bounds. if (base_index[i] % window.dimensions(i).base_dilation() != 0) { out_of_bound = true; break; } - // Apply the dilation to the base area. base_index[i] /= window.dimensions(i).base_dilation(); if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) { out_of_bound = true; diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 7a6d563b83f..934c96d7630 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -1596,8 +1596,8 @@ Status HloFusionInstruction::DeduplicateFusionOperands() { if (operands_to_remove.empty()) { return Status::OK(); } - TF_RETURN_IF_ERROR( - fused_instructions_computation()->RemoveUnusedParameters()); + TF_RETURN_IF_ERROR(fused_instructions_computation() + ->RemoveUnusedParametersFromFusedComputation()); RemoveOperandsAtAscendingIndices(operands_to_remove); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index 2c63247eea8..142b8f18ee6 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -109,6 +109,8 @@ class HloModule { return entry_computation_; } + bool has_entry_computation() const { return entry_computation_ != nullptr; } + // Returns the root instruction shape of entry computation. // // Precondition: entry_computation_ is not nullptr. 
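The index arithmetic spelled out in the hlo_evaluator_typed_visitor.h comment above can be sanity-checked against the ReduceWindowMaxIotaPaddingStrideBaseDilation expectation ({{0, 2, 3}, {8, 10, 11}, {12, 14, 15}}). The self-contained snippet below is editorial, not part of the patch; it recomputes output element (1, 0) of that test with the same formula, skipping holes and padding exactly as the visitor does, and prints 8.

#include <algorithm>
#include <cstdio>

int main() {
  // 4x4 iota base, and the test's window parameters.
  const int base[4][4] = {
      {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
  const int window_size = 3, padding = 1, stride = 3;
  const int window_dilation = 1, base_dilation = 2;
  const int out_row = 1, out_col = 0;
  int result = 0;  // init value of the reduce-window
  for (int wi = 0; wi < window_size; ++wi) {
    for (int wj = 0; wj < window_size; ++wj) {
      int bi = out_row * stride + wi * window_dilation - padding;
      int bj = out_col * stride + wj * window_dilation - padding;
      if (bi % base_dilation != 0 || bj % base_dilation != 0) continue;  // hole
      bi /= base_dilation;
      bj /= base_dilation;
      if (bi < 0 || bi >= 4 || bj < 0 || bj >= 4) continue;  // padding
      result = std::max(result, base[bi][bj]);
    }
  }
  std::printf("output[1][0] = %d\n", result);  // prints 8
  return 0;
}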
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 5ba390acfd4..9fb0cd7e077 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -207,7 +207,7 @@ StatusOr HloRunner::ExecuteWithDeviceBuffers( stream.Init(); ServiceExecutableRunOptions service_run_options = GetServiceRunOptionsForDevice(backend().default_device_ordinal(), &stream, - nullptr); + nullptr, RunId()); TF_ASSIGN_OR_RETURN(std::unique_ptr executable, CreateExecutable(std::move(module), run_hlo_passes)); @@ -243,7 +243,7 @@ StatusOr HloRunner::ExecuteWithDeviceBuffers( stream.Init(); ServiceExecutableRunOptions service_run_options = GetServiceRunOptionsForDevice(backend().default_device_ordinal(), &stream, - nullptr); + nullptr, RunId()); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer retval, @@ -294,6 +294,7 @@ StatusOr> HloRunner::ExecuteReplicated( options.num_replicas * options.arguments.size() + 1); std::vector> argument_buffer_slices; int64 index = 0; + RunId run_id; for (int64 i = 0; i < options.num_replicas; ++i) { int64 device = (*device_assignment)(i, 0); TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, @@ -301,7 +302,7 @@ StatusOr> HloRunner::ExecuteReplicated( streams.push_back(absl::make_unique(executor)); streams.back()->Init(); service_run_options.emplace_back(GetServiceRunOptionsForDevice( - device, streams.back().get(), device_assignment)); + device, streams.back().get(), device_assignment, run_id)); // Copy arguments to device. for (const Literal* argument : options.arguments) { @@ -443,7 +444,8 @@ StatusOr> HloRunner::CreateExecutable( } ServiceExecutableRunOptions HloRunner::GetServiceRunOptionsForDevice( - int64 device, se::Stream* stream, DeviceAssignment* device_assignment) { + int64 device, se::Stream* stream, DeviceAssignment* device_assignment, + RunId run_id) { ExecutableRunOptions run_options; run_options.set_device_ordinal(device); run_options.set_stream(stream); @@ -453,6 +455,7 @@ ServiceExecutableRunOptions HloRunner::GetServiceRunOptionsForDevice( if (device_assignment != nullptr) { run_options.set_device_assignment(device_assignment); } + run_options.set_run_id(run_id); return ServiceExecutableRunOptions(run_options, backend().StreamBorrower()); } diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index 7e666a8186e..c077ccd95fe 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -206,7 +206,8 @@ class HloRunner { // will be used to configure the replication parameters. Replicated executions // should pass the device_assignment parameter. ServiceExecutableRunOptions GetServiceRunOptionsForDevice( - int64 device, se::Stream* stream, DeviceAssignment* device_assignment); + int64 device, se::Stream* stream, DeviceAssignment* device_assignment, + RunId run_id); std::unique_ptr backend_; }; diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc index ba856fc17af..18ab401bc89 100644 --- a/tensorflow/compiler/xla/service/hlo_value.cc +++ b/tensorflow/compiler/xla/service/hlo_value.cc @@ -91,7 +91,8 @@ string HloValue::ToShortString() const { ? defining_index().ToString() : ""; return StrCat(id(), " ", is_phi_ ? "PHI " : "", - defining_instruction()->name(), index_str); + defining_instruction()->name(), index_str, " @", + (has_color() ? 
color().value() : -1)); } string HloValue::ToString(int indent) const { diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD index 7f0c1ccc728..feb3db64048 100644 --- a/tensorflow/compiler/xla/service/interpreter/BUILD +++ b/tensorflow/compiler/xla/service/interpreter/BUILD @@ -3,9 +3,10 @@ load( "if_static", ) -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//visibility:public"]) +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) cc_library( name = "interpreter_transfer_manager", diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index e1303f60779..72813d493cf 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -1,9 +1,10 @@ # Description: # Libraries for helping construct LLVM IR for XLA backends. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = [":friends"]) +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) package_group( name = "friends", @@ -39,6 +40,7 @@ cc_library( "//tensorflow/compiler/xla/service:logical_buffer", "//tensorflow/core:lib", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@llvm//:core", ], diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc index 761c6879db8..cd1431aa709 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/container/flat_hash_set.h" #include "llvm/IR/MDBuilder.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" @@ -40,15 +41,14 @@ void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo, // with our temporary buffers. buffer_slice = BufferAllocation::Slice(kParameterAllocation, 0, 0); } else { - const std::set slices = - assignment_.GetAllSlices(&hlo, index); - if (slices.empty() || slices.size() > 1) { + auto unique_slice = assignment_.GetUniqueSlice(&hlo, index); + if (!unique_slice.ok()) { // Skip HLOs which don't have a buffer assigned or for which the // buffer can't be determined statically. We cannot determine their // aliasing properties in these cases. return; } - buffer_slice = *slices.begin(); + buffer_slice = unique_slice.ValueOrDie(); } if (module_.config().debug_options().xla_llvm_enable_alias_scope_metadata()) { @@ -134,15 +134,26 @@ llvm::MDNode* AliasAnalysis::GetNoaliasMetadataForBuffer( // 3. Operands of the given hlo. // // This set can be increased as we need. - std::vector worklist; + std::vector worklist; + absl::flat_hash_set added_to_worklist; auto add_buffers_to_worklist = - [&worklist, &assignment](const HloInstruction* instruction) { + [&](const HloInstruction* instruction) { + // Buffers of parameters cannot be added to the noalias set. 
+ if (instruction->opcode() == HloOpcode::kParameter) { + return; + } + if (added_to_worklist.contains(instruction)) { + return; + } + added_to_worklist.insert(instruction); ShapeUtil::ForEachSubshape( instruction->shape(), [&](const Shape& /*shape*/, const ShapeIndex& index) { - for (const LogicalBuffer* buffer : + for (const BufferValue* buffer : assignment.GetSourceBuffers(instruction, index)) { - worklist.push_back(buffer); + if (assignment.HasAllocation(*buffer)) { + worklist.push_back(buffer); + } } }); }; @@ -160,12 +171,7 @@ llvm::MDNode* AliasAnalysis::GetNoaliasMetadataForBuffer( } std::set buffers; - for (const LogicalBuffer* buffer : worklist) { - // Skip buffers which cannot be added to the noalias set. - if (!assignment.HasAllocation(*buffer) || - buffer->instruction()->opcode() == HloOpcode::kParameter) { - continue; - } + for (const BufferValue* buffer : worklist) { const BufferAllocation::Slice noalias_slice = assignment.GetAssignedAllocation(*buffer).GetSlice(*buffer); // Our buffer must not overlap with the noalias slice. diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc index 4974cb57db3..ba199f35712 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc @@ -23,6 +23,37 @@ limitations under the License. namespace xla { namespace llvm_ir { +bool MayBeImplementedAsInPlaceDynamicUpdateSlice(const HloInstruction* instr) { + // Today we can't emit a dynamic-update-slice if the DUS node is parallelized; + // the emitter will not emit correct code. It's possible to change this, but + // then ParallelTaskAssigner would have to somehow know whether a node *will* + // be emitted as an in-place DUS, and it can't, because it doesn't have a + // buffer assignment when it runs. + if (!instr->outer_dimension_partitions().empty()) { + return false; + } + + // Until we know the final buffer assignment, any unfused dynamic-update-slice + // might be implementable as an in-place DUS. + if (instr->opcode() == HloOpcode::kDynamicUpdateSlice) { + return true; + } + + // A fusion may be implementable as an in-place dynamic update slice if + // - it's a loop fusion, + // - dynamic-update-slice is the root of the fusion, and + // - operand 0 of the dynamic-update-slice is a parameter to the fusion + // (ignoring any get-tuple-element operations in the way). + if (instr->IsLoopFusion()) { + const HloInstruction* fused_root = instr->fused_expression_root(); + return fused_root->opcode() == HloOpcode::kDynamicUpdateSlice && + fused_root->operand(0)->LatestNonGteAncestor()->opcode() == + HloOpcode::kParameter; + } + + return false; +} + bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice, const BufferAssignment& assignment) { CHECK_EQ(HloOpcode::kDynamicUpdateSlice, dynamic_update_slice->opcode()); @@ -32,6 +63,29 @@ bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice, assignment.SharesTopLevelSlice(dynamic_update_slice, operand); } +bool CanEmitFusedDynamicUpdateSliceInPlace(HloInstruction* fusion, + const BufferAssignment& assignment) { + CHECK_EQ(fusion->opcode(), HloOpcode::kFusion); + if (!MayBeImplementedAsInPlaceDynamicUpdateSlice(fusion)) { + return false; + } + + // Walk DynamicUpdateSlice operand(0) to fused parameter and get its + // associated operand. See if it shares an allocation with this operand.
+ HloInstruction* fused_root = fusion->fused_expression_root(); + HloInstruction* fusion_operand; + ShapeIndex index; + std::tie(fusion_operand, index) = + fused_root->mutable_operand(0)->LatestNonGteAncestorAndIndex(); + // MayBeImplementedAsInPlaceDynamicUpdateSlice should have ensured that + // fusion_operand is a parameter. + CHECK_EQ(fusion_operand->opcode(), HloOpcode::kParameter); + auto* operand = fusion->operand(fusion_operand->parameter_number()); + return assignment.HasAllocationAt(operand, index) && + assignment.HasAllocationAt(fusion, {}) && + assignment.SharesSliceAtIndex(fusion, {}, operand, index); +} + // Shared implementation of EmitDynamicUpdateSliceInPlace and // EmitFusedDynamicUpdateSliceInPlace. // diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h index c4da28229d0..70dc368d5d7 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h @@ -30,6 +30,22 @@ namespace llvm_ir { using GeneratorForOperandIrArrays = std::function()>; +// Determines whether the given instruction might be implemented as an +// in-place dynamic-update-slice after we have a buffer assignment. +// +// If this returns false, then CanUpdateDynamicSliceInPlace and +// CanEmitFusedDynamicUpdateSliceInPlace will also return false. +// +// This is useful if you want to check whether an instruction might be an +// in-place DUS during an HLO pass, at which point you don't have a buffer +// assignment. +// +// Note that simplifications to the HLO graph might change this function from +// returning false to returning true. Specifically, simplifying the contents of +// fusion nodes might cause a false->true transition. In general this isn't a +// problem by the time you're calling this function, but beware. +bool MayBeImplementedAsInPlaceDynamicUpdateSlice(const HloInstruction* instr); + // Checks if we can emit code for the given DynamicUpdateSlice node that updates // its input in place. Returns true if the dynamic-update-slice's // array-to-be-updated and output share the same BufferAllocation::Slice. @@ -40,28 +56,8 @@ bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice, // Checks if the given fusion node is amenable to being implemented by // EmitFusedDynamicUpdateSliceInPlace. -inline bool CanEmitFusedDynamicUpdateSliceInPlace( - HloInstruction* fusion, const BufferAssignment& assignment) { - CHECK_EQ(fusion->opcode(), HloOpcode::kFusion); - HloInstruction* fused_root = fusion->fused_expression_root(); - if (fused_root->opcode() != HloOpcode::kDynamicUpdateSlice || - !fusion->IsLoopFusion()) { - return false; - } - // Walk DynamicUpdateSlice operand(0) to fused parameter and get its - // associated operand. See if it shares an allocation with this operand. 
- HloInstruction* fusion_operand; - ShapeIndex index; - std::tie(fusion_operand, index) = - fused_root->mutable_operand(0)->LatestNonGteAncestorAndIndex(); - if (fusion_operand->opcode() != HloOpcode::kParameter) { - return false; - } - auto* operand = fusion->operand(fusion_operand->parameter_number()); - return assignment.HasAllocationAt(operand, index) && - assignment.HasAllocationAt(fusion, {}) && - assignment.SharesSliceAtIndex(fusion, {}, operand, index); -} +bool CanEmitFusedDynamicUpdateSliceInPlace(HloInstruction* fusion, + const BufferAssignment& assignment); // Emits IR for running the given dynamic-update-slice op in-place -- that is, // where the input and output buffers share the same slice, so we can simply diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc index 886a0545624..75e704bac66 100644 --- a/tensorflow/compiler/xla/service/platform_util.cc +++ b/tensorflow/compiler/xla/service/platform_util.cc @@ -263,12 +263,18 @@ PlatformUtil::GetStreamExecutors( // Block here in thread_pool destructor until all devices are initialized. } VLOG(1) << "Device initialization complete"; - if (absl::c_all_of(stream_executors, - [](se::StreamExecutor* s) { return s == nullptr; })) { + + std::vector out; + for (se::StreamExecutor* executor : stream_executors) { + if (executor != nullptr) { + out.push_back(executor); + } + } + if (out.empty()) { return InternalError("no supported devices found for platform %s", platform->Name()); } - return stream_executors; + return out; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h index 592b20282f3..5764f2c11d9 100644 --- a/tensorflow/compiler/xla/service/platform_util.h +++ b/tensorflow/compiler/xla/service/platform_util.h @@ -58,9 +58,7 @@ class PlatformUtil { static StatusOr GetPlatformExceptFor( const string& platform_name); - // Returns a vector of StreamExecutors for the given platform. The vector is - // indexed by device ordinal (device numbering used by StreamExecutor). If an - // element is nullptr, then the device is present by not supported by XLA. + // Returns a vector of StreamExecutors for the given platform. // If populated, only the devices in allowed_devices will have // their StreamExecutors initialized, otherwise all StreamExecutors will be // initialized and returned. diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h index 7fc66310ee7..58028aebe1f 100644 --- a/tensorflow/compiler/xla/service/service_executable_run_options.h +++ b/tensorflow/compiler/xla/service/service_executable_run_options.h @@ -24,7 +24,7 @@ limitations under the License. namespace xla { // Class containing options for running a LocalExecutable and other auxiliary -// data, now only a stream cache for GPU backend. +// data. 
class ServiceExecutableRunOptions { public: using StreamBorrower = std::function(int)>; diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index 999e8a9c0ac..cec954645cc 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -46,7 +46,7 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { // Don't try this transformation if the while loop isn't removable, since if // it succeeds ultimately we're going to have to replace the old while loop // with a new one. - if (!while_op->parent()->IsRemovable(while_op)) { + if (!while_op->parent()->IsSafelyRemovable(while_op)) { VLOG(2) << "Can't remove dead parameters from non-removable while op."; return false; } @@ -455,7 +455,7 @@ static StatusOr TryRemoveConstantParams(HloInstruction* while_op) { static StatusOr TryRemoveWhileLoop(HloInstruction* while_op) { // Cowardly refuse to remove loops that are not removable. In practice, this // means that we can't remove loops that have control predecessors/successors. - if (!while_op->parent()->IsRemovable(while_op)) { + if (!while_op->parent()->IsSafelyRemovable(while_op)) { VLOG(2) << "Not attempting to remove while loop that is not removable: " << while_op->ToShortString(); return false; diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc index 661b7aa7d99..4c221e2c116 100644 --- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc +++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc @@ -35,7 +35,7 @@ StatusOr ZeroSizedHloElimination::Run(HloModule* module) { instruction->opcode() == HloOpcode::kConstant) { continue; } - if (comp->IsRemovable(instruction) && + if (comp->IsSafelyRemovable(instruction) && ShapeUtil::IsZeroElementArray(instruction->shape())) { // If the instruction doesn't have a layout, use a default layout for // the literal. 
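The new dynamic_update_slice_util.h comments above spell out a two-stage contract: MayBeImplementedAsInPlaceDynamicUpdateSlice is the conservative answer available during HLO passes, before any BufferAssignment exists, while CanEmitFusedDynamicUpdateSliceInPlace and CanUpdateDynamicSliceInPlace give the precise answer once buffer assignment has run. A minimal sketch of how a caller might split the two checks; the wrapper names ShouldSkipParallelization and WillEmitInPlace are illustrative only and not part of this change:

#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"

namespace xla {

// Pass-time check: no BufferAssignment exists yet, so only the conservative
// answer is available. A true result means "leave this instruction alone; it
// might be emitted as an in-place DUS".
bool ShouldSkipParallelization(const HloInstruction* instr) {
  return llvm_ir::MayBeImplementedAsInPlaceDynamicUpdateSlice(instr);
}

// Emission-time check: a BufferAssignment is available, so the precise answer
// can be computed from the actual buffer slices.
bool WillEmitInPlace(HloInstruction* fusion,
                     const BufferAssignment& assignment) {
  return llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion, assignment);
}

}  // namespace xla

Since the header comment notes that simplifying a fusion's contents can flip the conservative check from false to true, callers should re-query it after running simplification passes rather than caching an earlier result.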
diff --git a/tensorflow/compiler/xla/status_macros.h b/tensorflow/compiler/xla/status_macros.h index 315136acc71..c37087cb2c8 100644 --- a/tensorflow/compiler/xla/status_macros.h +++ b/tensorflow/compiler/xla/status_macros.h @@ -187,28 +187,4 @@ class StatusAdaptorForMacros { .with_log_stack_trace() \ .add_ret_check_failure(#condition) -#define TF_ASSERT_OK_AND_ASSIGN(lhs, rexpr) \ - TF_ASSERT_OK_AND_ASSIGN_IMPL( \ - TF_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, \ - rexpr); - -#define TF_ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr) \ - auto statusor = (rexpr); \ - ASSERT_TRUE(statusor.status().ok()) << statusor.status(); \ - lhs = std::move(statusor.ValueOrDie()) - -#define TF_STATUS_MACROS_CONCAT_NAME(x, y) TF_STATUS_MACROS_CONCAT_IMPL(x, y) -#define TF_STATUS_MACROS_CONCAT_IMPL(x, y) x##y - -#define TF_ASSIGN_OR_RETURN(lhs, rexpr) \ - TF_ASSIGN_OR_RETURN_IMPL( \ - TF_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, rexpr) - -#define TF_ASSIGN_OR_RETURN_IMPL(statusor, lhs, rexpr) \ - auto statusor = (rexpr); \ - if (TF_PREDICT_FALSE(!statusor.ok())) { \ - return statusor.status(); \ - } \ - lhs = std::move(statusor.ValueOrDie()) - #endif // TENSORFLOW_COMPILER_XLA_STATUS_MACROS_H_ diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index cff87c59938..b2ba65eb46d 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -8,10 +8,9 @@ load( ) load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") -licenses(["notice"]) # Apache 2.0 - package( default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 ) package_group( @@ -1715,7 +1714,7 @@ xla_test( # This test is tagged "manual" because it requires multiple GPUs, and # Forge only supports single-GPU tests. Guitar skips "manual" tests # unless they're also tagged "guitar". - "noguitar", # TODO(b/131524578): Re-enable this. + "guitar", "manual", "multi_gpu", "no_oss", diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index d700437ed35..daaf332ed0f 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -105,7 +105,7 @@ class ClientLibraryTestBase : public ::testing::Test { const Shape* shape_with_output_layout = nullptr); // This executes the computation via the reference client (which connects a - // interpreter backend). The result is used as the expected values of the + // interpreter backend). The result is used as the expected value of the // computation. StatusOr ExecuteAndTransferReference( const XlaComputation& computation, @@ -385,6 +385,9 @@ class ClientLibraryTestBase : public ::testing::Test { StatusOr> ComputeValueAndReference( XlaBuilder* builder, absl::Span arguments); + // Converts an f32 literal to bf16 if use_bfloat16_ is true. + Literal MaybeConvertLiteralToBfloat16(const Literal& literal); + LocalClient* client_; LocalClient* ref_client_; // To compute reference result. ExecutionOptions execution_options_; @@ -402,8 +405,7 @@ class ClientLibraryTestBase : public ::testing::Test { const string& error_message)>& verify_output, const Shape* output_with_layout = nullptr); - // Converts an f32 shape/literal to bf16 if use_bfloat16_ is true. - Literal MaybeConvertLiteralToBfloat16(const Literal& literal); + // Converts an f32 shape to bf16 if use_bfloat16_ is true. 
Shape MaybeConvertShapeToBfloat16(const Shape& shape); // Whether to run tests with all float-type input/output converted to diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index 7eaa2791d47..2843d77607e 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -117,8 +117,10 @@ LocalClientTestBase::LocalClientTestBase(se::Platform* platform) : local_client_( ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()), thread_pool_wrapper_(new EigenThreadPoolWrapper()) { + // Take the first executor, since it's the default one. stream_executor_ = PlatformUtil::GetStreamExecutors(local_client_->platform()) - .ValueOrDie()[local_client_->default_device_ordinal()]; + .ValueOrDie() + .front(); transfer_manager_ = TransferManager::GetForPlatform(local_client_->platform()).ValueOrDie(); } diff --git a/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc b/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc index 7895895e3e7..da0c94c8fa9 100644 --- a/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc +++ b/tensorflow/compiler/xla/tests/multi_device_all_reduce_test.cc @@ -21,7 +21,9 @@ limitations under the License. #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/lib/core/blocking_counter.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/core/threadpool.h" // Tests cross-GPU all-reduce operatons. // @@ -210,5 +212,43 @@ XLA_TEST_F(MultiDeviceAllReduceTest, NcclChannelCaching) { EXPECT_THAT(OpenNcclChannels(), IsEmpty()); } +// Runs the same executable many times concurrently. The all-reduces should not +// conflict with one another. 
+XLA_TEST_F(MultiDeviceAllReduceTest, ManyConcurrentAllReduces) { + const int64 kNumElems = 1024; + const int64 kNumThreads = 200; + const int64 kRunsPerThread = 10; + + auto config = GetModuleConfigForTest(); + config.set_replica_count(2); + auto executable = test_runner_ + .CreateExecutable(MakeCrsModule(kNumElems, config), + /*run_hlo_passes=*/true) + .ValueOrDie(); + std::vector devices = {0, 1}; + auto device_assn = MakeDeviceAssn(devices); + + std::vector input_vec(kNumElems); + absl::c_iota(input_vec, 0); + auto input_literal = LiteralUtil::CreateR1(input_vec); + HloRunner::ReplicatedExecuteOptions opts; + opts.num_replicas = devices.size(); + opts.use_threads = true; + opts.arguments.push_back(&input_literal); + + tensorflow::BlockingCounter done(kNumThreads * kRunsPerThread); + tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), TestName(), + kNumThreads); + for (int64 i = 0; i < kNumThreads * kRunsPerThread; ++i) { + pool.Schedule([&] { + TF_ASSERT_OK( + test_runner_.ExecuteReplicated(executable.get(), opts, &device_assn) + .status()); + done.DecrementCount(); + }); + } + done.Wait(); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index 352b59f248b..fc0a4f541c6 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -527,32 +527,20 @@ XLA_TEST_P(ReduceWindowTest, Add128In128) { TEST_P(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) { Array2D input_array(14, 14, 1.0f); const auto input = CreateConstantFromArray(input_array, &builder_); - int win_len = 3; int stride = 1; Padding padding = Padding::kSame; ReduceWindowAdd(input, {win_len, win_len}, {stride, stride}, padding); - - auto res = ReferenceUtil::ReduceWindow2DAdd( - input_array, 0.0f, {win_len, win_len}, {stride, stride}, padding); - - ComputeAndCompareLiteral(&builder_, LiteralUtil::CreateFromArray(*res), - {}, DefaultErrorSpec()); + ComputeAndCompare(&builder_, {}, DefaultErrorSpec()); } TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) { Array2D input_array(6, 4, 1.0f); XlaOp input = Broadcast( CreateConstantFromLiteral(LiteralUtil::One(F32), &builder_), {6, 4}); - Padding padding = Padding::kSame; ReduceWindowAdd(input, {4, 2}, {3, 3}, padding); - - auto res = ReferenceUtil::ReduceWindow2DAdd(input_array, 0.0f, {4, 2}, {3, 3}, - padding); - - ComputeAndCompareLiteral(&builder_, LiteralUtil::CreateFromArray(*res), - {}, DefaultErrorSpec()); + ComputeAndCompare(&builder_, {}, DefaultErrorSpec()); } INSTANTIATE_TEST_CASE_P(ReduceWindowTestInstance, ReduceWindowTest, @@ -1056,77 +1044,139 @@ struct R2ReduceWindowTestData { int64 base_bounds[2]; int64 window_bounds[2]; int64 strides[2]; + int64 base_dilation[2]; + int64 window_dilation[2]; int64 pad_low[2]; int64 pad_high[2]; int64 layout[2]; Reducer reducer; } kR2TestCases[] = { {/*base_bounds=*/{4, 18}, /*window_bounds=*/{2, 4}, - /*strides=*/{1, 2}, /*pad_low=*/{0, 1}, /*pad_high=*/{1, 1}, + /*strides=*/{1, 2}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 1}, /*pad_high=*/{1, 1}, /*layout=*/{0, 1}, /*reducer=*/Reducer::kAdd}, {/*base_bounds=*/{2, 5}, /*window_bounds=*/{2, 4}, - /*strides=*/{1, 1}, /*pad_low=*/{0, 1}, /*pad_high=*/{1, 2}, + /*strides=*/{1, 1}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 1}, /*pad_high=*/{1, 2}, /*layout=*/{0, 1}, /*reducer=*/Reducer::kAdd}, {/*base_bounds=*/{1, 3}, 
/*window_bounds=*/{2, 3}, - /*strides=*/{1, 1}, /*pad_low=*/{0, 1}, /*pad_high=*/{1, 1}, + /*strides=*/{1, 1}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 1}, /*pad_high=*/{1, 1}, /*layout=*/{0, 1}, /*reducer=*/Reducer::kAdd}, {/*base_bounds=*/{3, 129}, /*window_bounds=*/{1, 100}, - /*strides=*/{2, 99}, /*pad_low=*/{0, 0}, /*pad_high=*/{35, 35}, + /*strides=*/{2, 99}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 0}, /*pad_high=*/{35, 35}, /*layout=*/{0, 1}, /*reducer=*/Reducer::kAdd}, // TODO(b/74260408): This test last failed on GPU on 2018-03-08, likely due to a // ptxas bug. #ifndef XLA_TEST_BACKEND_GPU {/*base_bounds=*/{6, 152}, /*window_bounds=*/{2, 25}, - /*strides=*/{5, 4}, /*pad_low=*/{0, 1}, /*pad_high=*/{10, 11}, + /*strides=*/{5, 4}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 1}, /*pad_high=*/{10, 11}, /*layout=*/{0, 1}, /*reducer=*/Reducer::kAdd}, #endif {/*base_bounds=*/{6, 4}, /*window_bounds=*/{4, 2}, - /*strides=*/{3, 3}, /*pad_low=*/{0, 1}, /*pad_high=*/{0, 1}, + /*strides=*/{3, 3}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 1}, /*pad_high=*/{0, 1}, /*layout=*/{0, 1}, /*reducer=*/Reducer::kAdd}, {/*base_bounds=*/{5, 147}, /*window_bounds=*/{1, 36}, - /*strides=*/{4, 5}, /*pad_low=*/{0, 0}, /*pad_high=*/{17, 17}, + /*strides=*/{4, 5}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 0}, /*pad_high=*/{17, 17}, /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd}, {/*base_bounds=*/{4, 153}, /*window_bounds=*/{2, 93}, - /*strides=*/{1, 1}, /*pad_low=*/{0, 1}, /*pad_high=*/{46, 46}, + /*strides=*/{1, 1}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 1}, /*pad_high=*/{46, 46}, /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd}, // Regression test for a bug that appeared in Inception (b/34784899). 
{/*base_bounds=*/{28, 28}, /*window_bounds=*/{3, 3}, - /*strides=*/{1, 1}, /*pad_low=*/{1, 1}, /*pad_high=*/{1, 1}, + /*strides=*/{1, 1}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{1, 1}, /*pad_high=*/{1, 1}, /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd}, {/*base_bounds=*/{4, 4}, /*window_bounds=*/{2, 2}, - /*strides=*/{1, 1}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, + /*strides=*/{1, 1}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, /*layout=*/{1, 0}, - /*reducer=*/Reducer::kAdd}, + /*reducer=*/Reducer::kMax}, + {/*base_bounds=*/{4, 4}, /*window_bounds=*/{2, 2}, + /*strides=*/{1, 1}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{2, 2}, + /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, + /*layout=*/{1, 0}, + /*reducer=*/Reducer::kMax}, + {/*base_bounds=*/{4, 4}, /*window_bounds=*/{2, 2}, + /*strides=*/{1, 1}, + /*base_dilation=*/{2, 2}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, + /*layout=*/{1, 0}, + /*reducer=*/Reducer::kMax}, + {/*base_bounds=*/{4, 4}, /*window_bounds=*/{2, 2}, + /*strides=*/{2, 2}, + /*base_dilation=*/{2, 2}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, + /*layout=*/{1, 0}, + /*reducer=*/Reducer::kMax}, + {/*base_bounds=*/{4, 4}, /*window_bounds=*/{2, 2}, + /*strides=*/{2, 2}, + /*base_dilation=*/{2, 2}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{3, 3}, /*pad_high=*/{3, 3}, + /*layout=*/{1, 0}, + /*reducer=*/Reducer::kMax}, + {/*base_bounds=*/{4, 4}, /*window_bounds=*/{2, 2}, + /*strides=*/{2, 2}, + /*base_dilation=*/{2, 2}, /*window_dilation=*/{2, 2}, + /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, + /*layout=*/{1, 0}, + /*reducer=*/Reducer::kMax}, // Regression test for a bug that appeared in Inception (b/34784899). {/*base_bounds=*/{4, 32}, /*window_bounds=*/{2, 2}, - /*strides=*/{2, 2}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, + /*strides=*/{2, 2}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd}, // Regression test for b/73903312: bf16 lacks precision to store result of // very large windows. Testing with a reasonable window larger than 128. {/*base_bounds=*/{8, 130}, /*window_bounds=*/{1, 130}, - /*strides=*/{1, 1}, /*pad_low=*/{0, 130}, /*pad_high=*/{0, 0}, + /*strides=*/{1, 1}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 130}, /*pad_high=*/{0, 0}, /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd}, {/*base_bounds=*/{8, 256}, /*window_bounds=*/{1, 4}, - /*strides=*/{1, 64}, /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, + /*strides=*/{1, 64}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 0}, /*pad_high=*/{0, 0}, /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd}, {/*base_bounds=*/{4096, 4096}, /*window_bounds=*/{1, 4}, - /*strides=*/{1, 1024}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0}, + /*strides=*/{1, 1024}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0}, /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd}, // Regression test for b/72234705: bf16 lacks precision to store incremental // results on very large windows. Using smaller window with minor dim 128. 
{/*base_bounds=*/{8, 128}, /*window_bounds=*/{2, 128}, - /*strides=*/{1, 1}, /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0}, + /*strides=*/{1, 1}, + /*base_dilation=*/{1, 1}, /*window_dilation=*/{1, 1}, + /*pad_low=*/{0, 0}, /*pad-high=*/{0, 0}, /*layout=*/{1, 0}, /*reducer=*/Reducer::kAdd}, }; @@ -1135,9 +1185,11 @@ string R2ReduceWindowTestDataToString( ::testing::tuple>& data) { const auto& param = ::testing::get<0>(data.param); string str = absl::StrCat( - "base_bounds_", absl::StrJoin(param.base_bounds, "x"), // - "__window_bounds_", absl::StrJoin(param.window_bounds, "x"), // - "__strides_", absl::StrJoin(param.strides, "x"), // + "base_bounds_", absl::StrJoin(param.base_bounds, "x"), // + "__window_bounds_", absl::StrJoin(param.window_bounds, "x"), // + "__strides_", absl::StrJoin(param.strides, "x"), // + "__base_dilation_", absl::StrJoin(param.base_dilation, "x"), // + "__window_dilation_", absl::StrJoin(param.window_dilation, "x"), // "__pad_low_", absl::StrJoin(param.pad_low, "x"), "__pad_high_", absl::StrJoin(param.pad_high, "x"), "__layout_", param.layout[0], "_", param.layout[1], // @@ -1158,14 +1210,18 @@ class R2ReduceWindowTest : public ReduceWindowTestBase, XlaBuilder b(TestName()); const auto& param = ::testing::get<0>(GetParam()); - const float kInitValue = 0.0f; Array2D input(param.base_bounds[0], param.base_bounds[1], 1.0f); + if (!::testing::get<1>(GetParam())) { + // We only do this in F32 mode, to avoid precision issues with BF16. + input = *MakeLinspaceArray2D(0, 100, param.base_bounds[0], + param.base_bounds[1]); + } Literal input_literal = LiteralUtil::CreateR2FromArray2DWithLayout( input, LayoutUtil::MakeLayout(param.layout)); XlaOp parameter; - auto input_arg = CreateParameterAndTransferLiteral(0, input_literal, "p0", - &b, ¶meter); + CreateParameterAndTransferLiteral(0, input_literal, "p0", &b, ¶meter); + std::vector> padding(2); for (int i = 0; i < 2; ++i) { padding[i] = {param.pad_low[i], param.pad_high[i]}; @@ -1173,6 +1229,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase, auto computation = param.reducer == kAdd ? CreateScalarAddComputation(FloatType(), &b) : CreateScalarMaxComputation(FloatType(), &b); + const float kInitValue = 0.0f; auto init_value = CreateConstantFromLiteral(LiteralUtil::CreateR0(kInitValue), &b); ReduceWindowWithGeneralPadding( @@ -1181,20 +1238,12 @@ class R2ReduceWindowTest : public ReduceWindowTestBase, /*computation=*/computation, /*window_dimensions=*/param.window_bounds, /*window_strides=*/param.strides, - /*base_dilations=*/{}, - /*window_dilations=*/{}, + /*base_dilations=*/param.base_dilation, + /*window_dilations=*/param.window_dilation, /*padding=*/padding); - auto reduce_func = param.reducer == kAdd - ? 
+[](float a, float b) { return a + b; } - : +[](float a, float b) { return std::max(a, b); }; - auto expected = ReferenceUtil::ReduceWindow2DGeneric( - /*operand=*/input, /*init=*/kInitValue, /*reduce_func=*/reduce_func, - /*window=*/param.window_bounds, - /*stride=*/param.strides, /*padding=*/padding); - - ComputeAndCompareLiteral(&b, LiteralUtil::CreateFromArray(*expected), - {input_arg.get()}, DefaultErrorSpec()); + ComputeAndCompare(&b, {MaybeConvertLiteralToBfloat16(input_literal)}, + DefaultErrorSpec()); } }; diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc index 4337aa4bf9a..1fa43c65445 100644 --- a/tensorflow/compiler/xla/tests/test_utils_test.cc +++ b/tensorflow/compiler/xla/tests/test_utils_test.cc @@ -258,7 +258,7 @@ XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsForGather) { auto module = ParseHloString(R"( HloModule Test -ENTRY %module(paramater.0: f32[200,100,300], parameter.1: s32[10,2]) -> +ENTRY %module(parameter.0: f32[200,100,300], parameter.1: s32[10,2]) -> f32[10,300] { %parameter.0 = f32[200,100,300] parameter(0) %parameter.1 = s32[10,2] parameter(1) diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc index 00b72cedbf5..697c24e6587 100644 --- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc +++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc @@ -100,6 +100,28 @@ XLA_TEST_F(TransferManagerTest, TransferR1F32) { result); } +XLA_TEST_F(TransferManagerTest, TransferR1F32AwkwardSizes) { + // Test transferring R1s from 0 to kMaxR1Size. The goal is to find bugs + // related to "awkwardly" sized R1s. + constexpr int kMaxR1Size = (1 << 11); + for (int i = 0; i < kMaxR1Size; ++i) { + std::vector inputs(i); + std::iota(inputs.begin(), inputs.end(), 0); + Literal literal = LiteralUtil::CreateR1(inputs); + const Shape& shape = literal.shape(); + auto device_buffer = AllocateDeviceBuffer(shape); + + // Round trip literal through device. + ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal, + device_buffer)); + TF_ASSERT_OK_AND_ASSIGN( + Literal result, + transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); + + LiteralTestUtil::ExpectR1Equal(inputs, result); + } +} + XLA_TEST_F(TransferManagerTest, TransferR1LargeF32) { std::vector test_vector(1024 * 1024); std::iota(test_vector.begin(), test_vector.end(), 0); @@ -276,8 +298,8 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) { } XLA_TEST_F(TransferManagerTest, TransferTokenFromDevice) { - // "Copy" a token from the device. The token has no physical representation so - // no copying is actually performed, but it shouldn't fail. + // "Copy" a token from the device. The token has no physical representation + // so no copying is actually performed, but it shouldn't fail. // TODO(b/110532604): Add transferring the token to device when this is // supported. auto device_buffer = AllocateDeviceBuffer(ShapeUtil::MakeTokenShape()); diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index 4edd13c79c7..fe8e83512f4 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -1,8 +1,9 @@ # Tools and utilities that aid in XLA development and usage. 
-licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow/compiler/xla:internal"]) +package( + default_visibility = ["//tensorflow/compiler/xla:internal"], + licenses = ["notice"], # Apache 2.0 +) # Filegroup used to collect source files for dependency checking. filegroup( diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index 257b1ef5c3d..411b305c6ab 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -89,6 +89,8 @@ struct Options { Options() : intra_op_thread_pool_size(tensorflow::port::NumSchedulableCPUs()) {} + bool NeedsRealData() const { return !use_fake_data && !compile_only; } + string fake_infeed_shape; string fake_outfeed_shape; @@ -106,6 +108,8 @@ struct Options { int num_runs = 1; int intra_op_thread_pool_size; + + bool compile_only = false; }; StatusOr> CompileExecutable( @@ -355,9 +359,9 @@ StatusOr> ParseRecordIoFile(absl::string_view filename, CHECK(!snapshots.empty()) << "No proto is successfully parsed from the file - the file possibly " "has a mismatched compression option, format, etc."; - CHECK(opts.use_fake_data) - << "Without --use_fake_data, you must pass an HloSnapshot -- HloProto " - "and textual HLO don't carry real data."; + CHECK(!opts.NeedsRealData()) + << "Without --use_fake_data or --compile_only, you must pass an " + "HloSnapshot -- HloProto and textual HLO don't carry real data."; return snapshots; } @@ -373,9 +377,9 @@ StatusOr ParseSingleHloFile(const string& filename, if (s.code() == tensorflow::error::NOT_FOUND) { return s; } - CHECK(opts.use_fake_data) - << "Without --use_fake_data, you must pass an HloSnapshot -- HloProto " - "and textual HLO don't carry real data."; + CHECK(!opts.NeedsRealData()) + << "Without --use_fake_data or --compile_only, you must pass an " + "HloSnapshot -- HloProto and textual HLO don't carry real data."; fprintf(stderr, "%s: is not HloSnapshot. Trying HloProto.\n", filename.c_str()); @@ -457,6 +461,11 @@ int RealMain(absl::Span args, const Options& opts) { exit_status = EXIT_FAILURE; continue; } + + if (opts.compile_only) { + continue; + } + LocalExecutable* executable = executables[i].ValueOrDie().get(); LOG(ERROR) << "Running iteration " << i; StatusOr result_status = @@ -518,6 +527,9 @@ int main(int argc, char** argv) { &opts.intra_op_thread_pool_size, "How many threads to use in the intra-op thread pool. " "Defaults to the number of CPUs."), + tensorflow::Flag("compile_only", &opts.compile_only, + "Whether the input should only be compiled, as opposed " + "to compiled and executed."), }; xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 55b092cfbaa..dacb5faa228 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -548,6 +548,78 @@ Status EraseElementFromVector(std::vector* container, const T& value) { container->erase(it); return Status::OK(); } + +// MakeCleanup(f) returns an RAII cleanup object that calls 'f' in its +// destructor. The easiest way to use MakeCleanup is with a lambda argument, +// capturing the return value in an 'auto' local variable. Most users will not +// need more sophisticated syntax than that. 
+// +// Example: +// void func() { +// auto resource = acquire_resource(); +// auto cleanup = MakeCleanup([&] { release_resource(resource); }); +// TF_RETURN_IF_ERROR(...); // phew, calls release_resource! +// } +// +// You can use Cleanup directly, instead of using MakeCleanup and auto, +// but there's rarely a reason to do that. +// +// You can call 'release()' on a Cleanup object to cancel the cleanup +// +// You probably do not want to capture by reference in the cleanup lambda a +// variable that is returned by the function. This can lead to disabling of RVO +// at best, and undefined behavior at worst. +template +class Cleanup { + public: + Cleanup() : released_(true), f_() {} + + template + explicit Cleanup(G&& f) : f_(std::forward(f)) {} + + Cleanup(Cleanup&& src) : released_(src.is_released()), f_(src.release()) {} + + // Implicitly move-constructible from any compatible Cleanup. The source + // will be released as if src.release() were called. A moved-from Cleanup can + // be safely destroyed or reassigned. + template + Cleanup(Cleanup&& src) : released_(src.is_released()), f_(src.release()) {} + + // Assignment to a Cleanup object behaves like destroying it and making a new + // one in its place, analogous to unique_ptr semantics. + Cleanup& operator=(Cleanup&& src) { + if (!released_) std::move(f_)(); + released_ = src.released_; + f_ = src.release(); + return *this; + } + + ~Cleanup() { + if (!released_) std::move(f_)(); + } + + // Releases the cleanup function instead of running it. Hint: use + // c.release()() to run early. + F release() { + released_ = true; + return std::move(f_); + } + + bool is_released() const { return released_; } + + private: + static_assert(!std::is_reference::value, "F must not be a reference"); + + bool released_ = false; + F f_; +}; + +template ::type> +ABSL_MUST_USE_RESULT Cleanup MakeCleanup(F&& f) { + return Cleanup(std::forward(f)); +} + } // namespace xla #define XLA_LOG_LINES(SEV, STRING) \ diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index 67f76d00703..7dc87ae08b6 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -610,7 +610,7 @@ message OpSharding { // all-to-all). message ReplicaGroup { // The ids of the replicas that belongs to the same group. The ordering of the - // ids matters in some op (e.g., all-to-all). + // ids matters in some ops (e.g., all-to-all). 
repeated int64 replica_ids = 1; } diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index acd984f9e99..694e75a447d 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -44,12 +44,15 @@ cc_library( srcs = [ "xrt_compilation_cache.cc", "xrt_device.cc", + "xrt_memory_manager.cc", "xrt_state.cc", "xrt_util.cc", ], hdrs = [ "xrt_compilation_cache.h", "xrt_device.h", + "xrt_memory_manager.h", + "xrt_refptr.h", "xrt_state.h", "xrt_util.h", ], diff --git a/tensorflow/compiler/xrt/cc/BUILD b/tensorflow/compiler/xrt/cc/BUILD index 5c1e86b76b4..59a965945ad 100644 --- a/tensorflow/compiler/xrt/cc/BUILD +++ b/tensorflow/compiler/xrt/cc/BUILD @@ -1,7 +1,6 @@ -licenses(["notice"]) # Apache 2.0 - package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) load( diff --git a/tensorflow/compiler/xrt/client/BUILD b/tensorflow/compiler/xrt/client/BUILD index 3908f026bcf..c06ae7fb1cb 100644 --- a/tensorflow/compiler/xrt/client/BUILD +++ b/tensorflow/compiler/xrt/client/BUILD @@ -65,6 +65,7 @@ cc_library( ":xrt_tf_client", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/service:computation_placer", diff --git a/tensorflow/compiler/xrt/client/xrt_client.cc b/tensorflow/compiler/xrt/client/xrt_client.cc index c1f06e91c4f..a91aba17650 100644 --- a/tensorflow/compiler/xrt/client/xrt_client.cc +++ b/tensorflow/compiler/xrt/client/xrt_client.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xrt/client/xrt_client.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xrt/client/xrt_tf_client.h" @@ -71,8 +72,11 @@ xla::StatusOr DeserializeTensorProtoAsLiteral( } // namespace -XrtBuffer::XrtBuffer(XrtTensorHandle handle, xla::Shape shape) - : handle_(std::move(handle)), shape_(std::move(shape)) {} +XrtBuffer::XrtBuffer(XrtTensorHandle handle, int xrt_device_ordinal, + xla::Shape shape) + : handle_(std::move(handle)), + xrt_device_ordinal_(xrt_device_ordinal), + shape_(std::move(shape)) {} XrtBuffer::~XrtBuffer() { Delete(); } @@ -100,17 +104,27 @@ XrtBuffer::~XrtBuffer() { Delete(); } "XRTAllocate", {&literal_handle}, /*output_arity=*/1, /*attrs=*/{}, tf_device_id)[0]); - return std::make_shared(std::move(buffer_handle), literal.shape()); + return std::make_shared(std::move(buffer_handle), + xrt_device_ordinal, literal.shape()); } /*static*/ xla::StatusOr> XrtBuffer::MakeTuple( const std::shared_ptr& context, - const std::vector>& elements) { + const std::vector>& elements, + int xrt_device_ordinal) { if (elements.empty()) { - return errors::Unimplemented( - "The arity zero case of MakeTuple is not implemented."); + // XRTMakeTuple cannot construct empty tuples. Construct via a literal + // instead. 
+ return FromLiteral(context, xrt_device_ordinal, + xla::LiteralUtil::MakeTuple({})); } - int tf_device_id = elements[0]->handle().device_id(); + + if (xrt_device_ordinal < 0 || + xrt_device_ordinal >= context->tf_device_ids().size()) { + return errors::InvalidArgument("Invalid XRT device ordinal ", + xrt_device_ordinal); + } + int tf_device_id = context->tf_device_ids().at(xrt_device_ordinal); xrt::XLATupleNode tuple_description; std::vector element_shapes; element_shapes.reserve(elements.size()); @@ -144,7 +158,8 @@ XrtBuffer::~XrtBuffer() { Delete(); } XrtTensorHandle buffer_handle = std::move(context->tf_context()->EnqueueOp( "XRTMakeTuple", args, /*output_arity=*/1, attrs, tf_device_id)[0]); return std::make_shared( - std::move(buffer_handle), xla::ShapeUtil::MakeTupleShape(element_shapes)); + std::move(buffer_handle), xrt_device_ordinal, + xla::ShapeUtil::MakeTupleShape(element_shapes)); } xla::StatusOr XrtBuffer::ToLiteral() const { @@ -193,8 +208,8 @@ XrtBuffer::DestructureTuple() { handle_.context()->EnqueueOp("XRTSubTuple", {&handle_, &index}, /*output_arity=*/1, /*attrs=*/{}, handle_.device_id())[0]); - output.push_back( - std::make_shared(std::move(sub), shape_.tuple_shapes(i))); + output.push_back(std::make_shared( + std::move(sub), xrt_device_ordinal_, shape_.tuple_shapes(i))); } return output; } @@ -343,7 +358,8 @@ xla::StatusOr> XrtExecutable::Execute( XrtTensorHandle result_handle = std::move(handle_.context()->EnqueueOp( "XRTExecute", inputs, /*output_arity=*/1, attrs, tf_device_id)[0]); - return std::make_shared(std::move(result_handle), shape_.result()); + return std::make_shared(std::move(result_handle), + xrt_device_ordinal, shape_.result()); } xla::StatusOr>> @@ -453,7 +469,7 @@ XrtExecutable::ExecuteReplicated( // TODO(phawkins): use a per-core result shape here. results(i, j) = std::make_shared( - std::move(outputs[output_num]), shape_.result()); + std::move(outputs[output_num]), xrt_device_ordinal, shape_.result()); ++output_num; } } diff --git a/tensorflow/compiler/xrt/client/xrt_client.h b/tensorflow/compiler/xrt/client/xrt_client.h index c54f156e95b..fe0b650fb95 100644 --- a/tensorflow/compiler/xrt/client/xrt_client.h +++ b/tensorflow/compiler/xrt/client/xrt_client.h @@ -55,7 +55,8 @@ class XrtBuffer { // Builds a new XrtBuffer tuple from its constituent parts. static xla::StatusOr> MakeTuple( const std::shared_ptr& context, - const std::vector>& elements); + const std::vector>& elements, + int xrt_device_ordinal); // Converts an XrtBuffer to an XLA literal, copying the buffer from the remote // host. Blocks until the buffer is available. @@ -71,7 +72,7 @@ class XrtBuffer { // tensors and vice-versa for TF interoperability. XrtBuffer() = default; - XrtBuffer(XrtTensorHandle handle, xla::Shape shape); + XrtBuffer(XrtTensorHandle handle, int xrt_device_ordinal, xla::Shape shape); ~XrtBuffer(); // Calls Delete(). // A buffer reference is moveable but not copyable. @@ -81,11 +82,13 @@ class XrtBuffer { XrtBuffer& operator=(XrtBuffer&&) = default; const XrtTensorHandle& handle() const { return handle_; } + int xrt_device_ordinal() const { return xrt_device_ordinal_; } const xla::Shape& shape() const { return shape_; } private: // Tensor that contains the XRT allocation ID. 
XrtTensorHandle handle_; + int xrt_device_ordinal_; xla::Shape shape_; }; diff --git a/tensorflow/compiler/xrt/client/xrt_client_test.cc b/tensorflow/compiler/xrt/client/xrt_client_test.cc index e64c986f44e..d9e94b01d2c 100644 --- a/tensorflow/compiler/xrt/client/xrt_client_test.cc +++ b/tensorflow/compiler/xrt/client/xrt_client_test.cc @@ -302,6 +302,22 @@ TEST_F(XrtClientTest, TupleDestructuringAndDelete) { pieces[1]->Delete(); } +TEST_F(XrtClientTest, EmptyTuples) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr context, MakeContext()); + + // Tests sending a literal to and from the device. + TF_ASSERT_OK_AND_ASSIGN( + std::shared_ptr buffer, + XrtBuffer::MakeTuple(context, /*elements=*/{}, /*xrt_device_ordinal=*/0)); + TF_ASSERT_OK_AND_ASSIGN(std::vector> pieces, + buffer->DestructureTuple()); + EXPECT_EQ(pieces.size(), 0); + + TF_ASSERT_OK_AND_ASSIGN(xla::Literal out, buffer->ToLiteral()); + ASSERT_TRUE(out.shape().IsTuple()); + EXPECT_EQ(out.shape().tuple_shapes_size(), 0); +} + TEST_F(XrtClientTest, TupleConstructionAndDestructuring) { TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr context, MakeContext()); @@ -326,8 +342,9 @@ TEST_F(XrtClientTest, TupleConstructionAndDestructuring) { EXPECT_TRUE(xla::LiteralTestUtil::Equal(b, b_in)); std::vector> elems = {a_buffer, b_buffer}; - TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr buffer, - XrtBuffer::MakeTuple(context, elems)); + TF_ASSERT_OK_AND_ASSIGN( + std::shared_ptr buffer, + XrtBuffer::MakeTuple(context, elems, /*xrt_device_ordinal=*/0)); TF_ASSERT_OK_AND_ASSIGN(std::vector> pieces, buffer->DestructureTuple()); diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index d89dc4642be..231387e314f 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xrt/xrt.pb.h" #include "tensorflow/compiler/xrt/xrt_compilation_cache.h" #include "tensorflow/compiler/xrt/xrt_device.h" +#include "tensorflow/compiler/xrt/xrt_memory_manager.h" #include "tensorflow/compiler/xrt/xrt_state.h" #include "tensorflow/compiler/xrt/xrt_util.h" #include "tensorflow/core/framework/op_kernel.h" @@ -72,28 +73,31 @@ uint32 GetXLARandomSeed() { } xla::StatusOr GetInputBuffers( - ResourceMgr* rm, const std::vector& input_coords, - bool release_inputs) { + XRTMemoryManager::WorkingSet* working_set, xla::Backend* backend, + const std::vector& input_coords, bool release_inputs) { InputBuffers input_buffers; input_buffers.input_tuples.reserve(input_coords.size()); input_buffers.input_allocations.reserve(input_coords.size()); input_buffers.input_pointers.reserve(input_coords.size()); for (size_t i = 0; i < input_coords.size(); ++i) { - XRTTupleAllocation* tuple; TF_RETURN_IF_ERROR( - XRTTupleAllocation::Lookup(rm, input_coords[i].handle, &tuple)); + working_set->LookupAndPin(backend, input_coords[i].handle)); + auto tuple = working_set->PinnedTuples().back(); input_buffers.input_tuples.emplace_back(tuple); if (release_inputs) { // We are holding a reference to the tuple, so we can safely delete it // from the resource manager here. 
- TF_RETURN_IF_ERROR(XRTTupleAllocation::DeleteFromResourceManager( - rm, input_coords[i].handle)); + TF_RETURN_IF_ERROR( + working_set->MemoryManager()->Release(input_coords[i].handle)); VLOG(2) << "Released allocation handle " << input_coords[i].handle; } if (input_coords[i].index.empty()) { - input_buffers.input_allocations.emplace_back(tuple->ToShapedBuffer()); + TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, + tuple->ToShapedBuffer()); + input_buffers.input_allocations.emplace_back(std::move(shaped_buffer)); } else { - xla::ShapedBuffer shaped_buffer = tuple->ToShapedBuffer(); + TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, + tuple->ToShapedBuffer()); TF_ASSIGN_OR_RETURN(xla::ShapedBuffer sub_shaped_buffer, shaped_buffer.SubShapedBuffer(input_coords[i].index)); input_buffers.input_allocations.emplace_back( @@ -107,28 +111,25 @@ xla::StatusOr GetInputBuffers( } xla::StatusOr GetChainedOpInputs( - const xrt::XRTChainedExecuteOp& op, int current_index, - absl::Span> ops_outputs) { + const xrt::XRTChainedExecuteOp& op, + absl::Span> op_inputs) { InputBuffers input_buffers; input_buffers.input_tuples.reserve(op.inputs_size()); input_buffers.input_allocations.reserve(op.inputs_size()); input_buffers.input_pointers.reserve(op.inputs_size()); - for (auto& input : op.inputs()) { - if (input.op_index() >= current_index) { - return errors::InvalidArgument( - "Input index ", input.op_index(), - " is above the current position: ", current_index); - } - input_buffers.input_tuples.emplace_back(ops_outputs[input.op_index()]); + for (int i = 0; i < op.inputs_size(); ++i) { + auto& input = op.inputs(i); + input_buffers.input_tuples.emplace_back(op_inputs[i]); // Thanks to the greatness of proto3, there is no way to query for // explicitly set fields, so the default for output_index (zero) means no // sub-index. As consequence, the real index is output_index - 1. if (input.output_index() == 0) { - input_buffers.input_allocations.emplace_back( - input_buffers.input_tuples.back()->ToShapedBuffer()); + TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, + input_buffers.input_tuples.back()->ToShapedBuffer()); + input_buffers.input_allocations.emplace_back(std::move(shaped_buffer)); } else { - xla::ShapedBuffer shaped_buffer = - input_buffers.input_tuples.back()->ToShapedBuffer(); + TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, + input_buffers.input_tuples.back()->ToShapedBuffer()); TF_ASSIGN_OR_RETURN( xla::ShapedBuffer sub_shaped_buffer, shaped_buffer.SubShapedBuffer({input.output_index() - 1})); @@ -142,7 +143,7 @@ xla::StatusOr GetChainedOpInputs( return std::move(input_buffers); } -xla::StatusOr> ExecuteComputation( +xla::StatusOr> RunExecutable( OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const InputBuffers& input_buffers, se::Stream* stream, int rng_seed) { @@ -190,15 +191,35 @@ xla::StatusOr> ExecuteComputation( } xla::StatusOr> ExecuteComputation( - OpKernelContext* context, ResourceMgr* rm, + OpKernelContext* context, XRTMemoryManager* memory_manager, + XRTGenericDeviceAccessor::ScopedRef* device_ref, + xla::LocalExecutable* executable, const InputBuffers& input_buffers, + se::Stream* stream, int rng_seed) { + auto runfn = [&]() { + return RunExecutable(context, device_ref, executable, input_buffers, stream, + rng_seed); + }; + + // We pass zero as requested_free_size as there is no simple way to get the + // peak heap size. 
Upon zero, the Run() API will try to free chunks of device + // memory, until either the runfn can run, or we run out of freeable memory. + return memory_manager->Run>( + runfn, device_ref->backend(), device_ref->device_ordinal(), + /*requested_free_size=*/0); +} + +xla::StatusOr> ExecuteComputation( + OpKernelContext* context, const RefPtr& memory_manager, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const std::vector& input_coords, bool release_inputs, se::Stream* stream, int rng_seed) { + XRTMemoryManager::WorkingSet working_set(memory_manager); TF_ASSIGN_OR_RETURN(InputBuffers input_buffers, - GetInputBuffers(rm, input_coords, release_inputs)); - return ExecuteComputation(context, device_ref, executable, input_buffers, - stream, rng_seed); + GetInputBuffers(&working_set, device_ref->backend(), + input_coords, release_inputs)); + return ExecuteComputation(context, memory_manager.get(), device_ref, + executable, input_buffers, stream, rng_seed); } // XRTExecuteOp @@ -265,8 +286,9 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { se::Stream* stream = context->op_device_context() ? context->op_device_context()->stream() : nullptr; + RefPtr memory_manager = XRTMemoryManager::Get(rm); TF_ASSIGN_OR_RETURN(std::vector input_coords, - GetComputationInputs(context, rm, "input_handles")); + GetComputationInputs(context, "input_handles")); std::unique_ptr entry; TF_RETURN_IF_ERROR(cache->Lookup(compilation_handle, &entry)); @@ -279,10 +301,11 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { TF_ASSIGN_OR_RETURN( RefPtr output_tuple, - ExecuteComputation(context, rm, &device_ref, executable, input_coords, - release_inputs, stream, rng_seed)); + ExecuteComputation(context, memory_manager, &device_ref, executable, + input_coords, release_inputs, stream, rng_seed)); - return CreateExecuteOutput(context, rm, std::move(output_tuple), + return CreateExecuteOutput(context, memory_manager.get(), + std::move(output_tuple), config_proto.return_exploded_tuple()); } @@ -346,22 +369,23 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { se::Stream* stream = context->op_device_context() ? 
context->op_device_context()->stream() : nullptr; - auto execute_op = - [&](const xrt::XRTChainedExecuteOp& op, int current_index, - absl::Span> ops_outputs) + RefPtr memory_manager = XRTMemoryManager::Get(rm); + auto execute_op = [&](const xrt::XRTChainedExecuteOp& op, + absl::Span> op_inputs) -> xla::StatusOr> { TF_ASSIGN_OR_RETURN(InputBuffers input_buffers, - GetChainedOpInputs(op, current_index, ops_outputs)); + GetChainedOpInputs(op, op_inputs)); std::unique_ptr entry; TF_RETURN_IF_ERROR(cache->Lookup(op.computation_handle(), &entry)); xla::LocalExecutable* executable = entry->get().get_executable(); - return ExecuteComputation(context, &device_ref, executable, input_buffers, - stream, rng_seed); + return ExecuteComputation(context, memory_manager.get(), &device_ref, + executable, input_buffers, stream, rng_seed); }; - return ExecuteChained(context, rm, plan, config, execute_op); + return ExecuteChained(context, memory_manager, device_ref.backend(), + device_ref.device_ordinal(), plan, config, execute_op); } XRTExecuteChainedOp::~XRTExecuteChainedOp() = default; diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h index 8a54e0987e5..c3511b1d5d4 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h +++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/compiler/xrt/xrt.pb.h" #include "tensorflow/compiler/xrt/xrt_device.h" +#include "tensorflow/compiler/xrt/xrt_memory_manager.h" #include "tensorflow/compiler/xrt/xrt_state.h" #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/framework/op_kernel.h" @@ -103,8 +104,8 @@ class XRTStateHelpers { TF_RET_CHECK( TensorShapeUtils::IsScalar(input_tensor_list[input_index].shape())); int64 key = input_tensor_list[input_index].scalar()(); - TF_RETURN_IF_ERROR( - XRTTupleAllocation::Lookup(rm, key, &input.allocation)); + TF_ASSIGN_OR_RETURN(input.allocation, + XRTMemoryManager::Get(rm)->Lookup(key)); input.release_allocation_after_use = release_this_input; } } @@ -192,17 +193,14 @@ class XRTAllocateOp : public OpKernel { class DeviceAccessor::ScopedRef device_ref; OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref)); + RefPtr memory_manager = XRTMemoryManager::Get(rm); XRTTupleAllocation* allocation; OP_REQUIRES_OK(ctx, XRTTupleAllocation::CreateAndTransfer( - literal, device_ref.backend(), + literal, memory_manager.get(), device_ref.backend(), device_ref.device_ordinal(), &allocation)); - // Intern takes ownership of our reference to allocation. - int64 key; - OP_REQUIRES_OK(ctx, allocation->Intern(rm, &key)); - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = key; + output.scalar()() = memory_manager->Register(allocation); ctx->set_output(0, output); } }; @@ -291,17 +289,14 @@ class XRTAllocateFromTensorOp : public OpKernel { class DeviceAccessor::ScopedRef device_ref; OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref)); + RefPtr memory_manager = XRTMemoryManager::Get(rm); XRTTupleAllocation* allocation; OP_REQUIRES_OK(ctx, XRTTupleAllocation::CreateAndTransfer( - literal, device_ref.backend(), + literal, memory_manager.get(), device_ref.backend(), device_ref.device_ordinal(), &allocation)); - // Intern takes ownership of our reference to allocation. 
- int64 key; - OP_REQUIRES_OK(ctx, allocation->Intern(rm, &key)); - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = key; + output.scalar()() = memory_manager->Register(allocation); ctx->set_output(0, output); } @@ -342,28 +337,22 @@ class XRTSubTupleOp : public OpKernel { ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - XRTTupleAllocation* allocation; - OP_REQUIRES_OK( - ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation)); - core::ScopedUnref allocation_unref(allocation); + RefPtr memory_manager = XRTMemoryManager::Get(rm); + RefPtr allocation; + OP_REQUIRES_OK(ctx, memory_manager->Lookup(allocation_handle, &allocation)); if (discard_) { VLOG(2) << "Releasing handle " << allocation_handle; - OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager( - rm, allocation_handle)); + OP_REQUIRES_OK(ctx, memory_manager->Release(allocation_handle)); } XRTTupleAllocation* suballocation; OP_REQUIRES_OK( - ctx, XRTTupleAllocation::MakeSubBuffer(allocation, shape_index, + ctx, XRTTupleAllocation::MakeSubBuffer(allocation.get(), shape_index, &suballocation, !discard_)); - // Intern takes ownership of our reference to suballocation. - int64 key; - OP_REQUIRES_OK(ctx, suballocation->Intern(rm, &key)); - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = key; + output.scalar()() = memory_manager->Register(suballocation); ctx->set_output(0, output); } }; @@ -398,14 +387,6 @@ class XRTMakeTupleOp : public OpKernel { // exit. std::vector input_vector( arg_list.size()); - auto cleanup = gtl::MakeCleanup([&input_vector] { - for (auto& input : input_vector) { - if (input.allocation != nullptr) { - input.allocation->Unref(); - } - } - }); - ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); @@ -425,28 +406,22 @@ class XRTMakeTupleOp : public OpKernel { OP_REQUIRES_OK( ctx, DeviceAccessor::InitScopedRef(ctx, device_ordinal, &device_ref)); + RefPtr memory_manager = XRTMemoryManager::Get(rm); XRTTupleAllocation* output_allocation; OP_REQUIRES_OK(ctx, XRTTupleAllocation::MakeTuple( - device_ref.backend(), device_ref.device_ordinal(), - tuple_shape_tree, &output_allocation)); - // Add a ScopedUnref to simplify the error path while calling - // DeleteFromResourceManager. - core::ScopedUnref unref(output_allocation); + memory_manager.get(), device_ref.backend(), + device_ref.device_ordinal(), tuple_shape_tree, + &output_allocation)); + RefPtr output_ptr(output_allocation); for (int i = 0; i < input_vector.size(); ++i) { if (input_vector[i].release_allocation_after_use) { - OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager( - rm, arg_list[i].scalar()())); + OP_REQUIRES_OK(ctx, + memory_manager->Release(arg_list[i].scalar()())); } } - // Intern takes ownership of a reference to output_allocation, so add - // another since the ScopedUnref will release one when this method exits. 
- output_allocation->Ref(); - int64 key; - OP_REQUIRES_OK(ctx, output_allocation->Intern(rm, &key)); - Tensor output(DT_INT64, TensorShape({})); - output.scalar()() = key; + output.scalar()() = memory_manager->Register(std::move(output_ptr)); ctx->set_output(0, output); } }; @@ -473,15 +448,13 @@ class XRTReadLiteralOp : public OpKernel { ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - XRTTupleAllocation* allocation; - OP_REQUIRES_OK( - ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation)); - core::ScopedUnref allocation_unref(allocation); + RefPtr memory_manager = XRTMemoryManager::Get(rm); + RefPtr allocation; + OP_REQUIRES_OK(ctx, memory_manager->Lookup(allocation_handle, &allocation)); if (discard_) { VLOG(2) << "Releasing handle " << allocation_handle; - OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager( - rm, allocation_handle)); + OP_REQUIRES_OK(ctx, memory_manager->Release(allocation_handle)); } // We are guaranteed that the underlying device object won't be deleted out @@ -491,9 +464,7 @@ class XRTReadLiteralOp : public OpKernel { ctx, allocation->device_ordinal(), &device_ref)); xla::Literal literal(allocation->on_host_shape()); - OP_REQUIRES_OK( - ctx, allocation->ToLiteral(device_ref.backend(), - device_ref.device_ordinal(), &literal)); + OP_REQUIRES_OK(ctx, allocation->ToLiteral(device_ref.backend(), &literal)); xla::LiteralProto literal_proto = literal.ToProto(); Tensor output(DT_STRING, TensorShape({})); @@ -529,15 +500,13 @@ class XRTReadToTensorOp : public OpKernel { ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - XRTTupleAllocation* allocation; - OP_REQUIRES_OK( - ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation)); - core::ScopedUnref allocation_unref(allocation); + RefPtr memory_manager = XRTMemoryManager::Get(rm); + RefPtr allocation; + OP_REQUIRES_OK(ctx, memory_manager->Lookup(allocation_handle, &allocation)); if (discard_) { VLOG(2) << "Releasing handle " << allocation_handle; - OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager( - rm, allocation_handle)); + OP_REQUIRES_OK(ctx, memory_manager->Release(allocation_handle)); } // We are guaranteed that the underlying device object won't be deleted out @@ -573,15 +542,14 @@ class XRTReadToTensorOp : public OpKernel { XRTTupleAllocation* sub; TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( - allocation, index, &sub, /*alias_parent_allocation=*/true)); + allocation.get(), index, &sub, /*alias_parent_allocation=*/true)); core::ScopedUnref sub_unref(sub); xla::MutableBorrowingLiteral literal; TF_RETURN_IF_ERROR(HostTensorToMutableBorrowingLiteral( xla::LayoutUtil::GetWithDefaultLayout(*subshape), output_tensor, &literal)); - TF_RETURN_IF_ERROR(sub->ToLiteral( - device_ref.backend(), device_ref.device_ordinal(), &literal)); + TF_RETURN_IF_ERROR(sub->ToLiteral(device_ref.backend(), &literal)); ++output; return Status::OK(); @@ -624,10 +592,10 @@ class XRTWriteLiteralOp : public OpKernel { ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - XRTTupleAllocation* allocation; - OP_REQUIRES_OK( - ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation)); - core::ScopedUnref allocation_unref(allocation); + RefPtr memory_manager = XRTMemoryManager::Get(rm); + RefPtr allocation; + OP_REQUIRES_OK(ctx, memory_manager->Lookup(allocation_handle, &allocation)); + // We are guaranteed that the underlying device object won't be deleted out // from under 
us, while the ScopedRef is live. typename DeviceAccessor::ScopedRef device_ref; @@ -657,12 +625,12 @@ class XRTReleaseAllocationOp : public OpKernel { ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); + RefPtr memory_manager = XRTMemoryManager::Get(rm); const Tensor& allocation_handle = ctx->input(0); auto flat_keys = allocation_handle.flat(); for (int64 i = 0; i < flat_keys.size(); ++i) { int64 key = flat_keys(i); - OP_REQUIRES_OK(ctx, - XRTTupleAllocation::DeleteFromResourceManager(rm, key)); + OP_REQUIRES_OK(ctx, memory_manager->Release(key)); VLOG(2) << "Released allocation handle " << key; } } @@ -684,7 +652,7 @@ class XRTReleaseAllAllocationsOp : public OpKernel { ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); - OP_REQUIRES_OK(ctx, XRTTupleAllocation::ReleaseAllAllocations(rm)); + XRTMemoryManager::Get(rm)->ReleaseAllAllocations(); } }; @@ -701,11 +669,11 @@ class XRTCompactAllocationsOp : public OpKernel { ResourceMgr* rm; OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm)); + RefPtr memory_manager = XRTMemoryManager::Get(rm); class DeviceAccessor::ScopedRef device_ref; OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref)); - OP_REQUIRES_OK(ctx, - XRTTupleAllocation::CompactAllocations( - rm, device_ref.backend(), device_ref.device_ordinal())); + OP_REQUIRES_OK(ctx, memory_manager->CompactAllocations( + device_ref.backend(), device_ref.device_ordinal())); } }; diff --git a/tensorflow/compiler/xrt/tests/BUILD b/tensorflow/compiler/xrt/tests/BUILD index 3a19327e5b5..f8341e1ee0f 100644 --- a/tensorflow/compiler/xrt/tests/BUILD +++ b/tensorflow/compiler/xrt/tests/BUILD @@ -1,13 +1,12 @@ -licenses(["notice"]) # Apache 2.0 - package( default_visibility = [ "//learning/brain:__subpackages__", "//tensorflow/compiler:__subpackages__", ], + licenses = ["notice"], # Apache 2.0 ) -load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test", "tf_cc_test") +load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_cc_test") load( "//tensorflow/core:platform/default/build_config_root.bzl", "tf_cuda_tests_tags", @@ -34,6 +33,8 @@ cc_library( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/lib:constants", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xrt:xrt_proto", "//tensorflow/compiler/xrt:xrt_server", diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index 305b3a67fae..b5108acff16 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -25,6 +25,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" @@ -47,6 +49,15 @@ limitations under the License. 
namespace tensorflow { namespace { +class XrtClientSession : public ClientSession { + public: + explicit XrtClientSession(const Scope& scope) : ClientSession(scope) { + auto clear_all = ops::XRTReleaseAllAllocations(scope); + std::vector outputs; + TF_CHECK_OK(Run(ClientSession::FeedType(), {}, {clear_all}, &outputs)); + } +}; + string* xla_test_device_ptr; // initial value set in main() string* xla_platform_ptr; // initial value set in main() @@ -235,6 +246,26 @@ xla::XlaComputation AddAndSubTuple() { return builder.Build().ValueOrDie(); } +xla::XlaComputation BroadcastComputation( + const xla::Shape& shape, absl::Span dimensions) { + xla::XlaBuilder builder("BroadcastComputation"); + auto p0 = xla::Parameter(&builder, 0, shape, "P0"); + xla::Broadcast(p0, dimensions); + return builder.Build().ValueOrDie(); +} + +xla::XlaComputation IsEqualComputation(const xla::Shape& shape) { + xla::XlaBuilder builder("IsEqualComputation"); + auto p0 = xla::Parameter(&builder, 0, shape, "P0"); + auto p1 = xla::Parameter(&builder, 1, shape, "P1"); + auto cmp = + xla::Ne(xla::Sub(p0, p1), xla::Zero(&builder, shape.element_type())); + auto icmp = xla::ConvertElementType(cmp, xla::S32); + xla::ReduceAll(icmp, xla::Zero(&builder, xla::S32), + xla::CreateScalarAddComputation(xla::S32, &builder)); + return builder.Build().ValueOrDie(); +} + void StoreComputationSnapshot(const xla::XlaComputation& computation, xla::HloSnapshot* dst) { auto snapshot = computation.Snapshot().ValueOrDie(); @@ -279,7 +310,7 @@ TEST(RawApiTest, AllocFromTensor) { auto read_back = ops::XRTReadLiteralAndRelease(root, handle); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back}, &outputs)); EXPECT_EQ(outputs.size(), 1); @@ -310,7 +341,7 @@ TEST(RawApiTest, AllocFromTensorTuple) { auto read_back = ops::XRTReadLiteralAndRelease(root, handle); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back}, &outputs)); EXPECT_EQ(outputs.size(), 1); @@ -336,7 +367,7 @@ TEST(RawApiTest, AllocFromTensorTupleSingle) { auto read_back = ops::XRTReadLiteralAndRelease(root, handle); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back}, &outputs)); EXPECT_EQ(outputs.size(), 1); @@ -362,7 +393,7 @@ TEST(RawApiTest, AllocFromTensorRelayout) { auto read_back = ops::XRTReadLiteralAndRelease(root, handle); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back}, &outputs)); EXPECT_EQ(outputs.size(), 1); @@ -389,7 +420,7 @@ TEST(RawApiTest, AllocAndRewrite) { auto read_back = ops::XRTReadLiteral(root, handle); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back, handle}, &outputs)); EXPECT_EQ(outputs.size(), 2); @@ -442,7 +473,7 @@ TEST(RawApiTest, AllocReleaseMany) { auto handle2 = ops::XRTAllocate(root, value2); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({handle1, handle2}, &outputs)); EXPECT_EQ(outputs.size(), 2); @@ -491,7 +522,7 @@ TEST(RawApiTest, CompileAndReleaseMany) { auto c_handle2 = ops::XRTCompile(root, computation2); TF_ASSERT_OK(root.status()); - ClientSession 
session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({c_handle1.handle, c_handle2.handle}, &outputs)); EXPECT_EQ(outputs.size(), 2); @@ -518,7 +549,7 @@ TEST(RawApiTest, AllocAndClearAll) { auto handle = ops::XRTAllocate(root, value); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({handle}, &outputs)); EXPECT_EQ(outputs.size(), 1); @@ -549,7 +580,7 @@ TEST(RawApiTest, ReadAndWriteState) { root.WithControlDependencies(read_back), handle); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK( session.Run(ClientSession::FeedType(), {read_back}, {release}, &outputs)); @@ -571,7 +602,7 @@ TEST(RawApiTest, ReadAndWriteStateAutoFree) { auto read_back = ops::XRTReadLiteralAndRelease(root, handle); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back}, &outputs)); @@ -602,7 +633,7 @@ TEST(RawApiTest, SubBuffer) { auto value_00 = ops::XRTReadLiteralAndRelease(root, sub_00); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({value_0, value_1, value_00}, &outputs)); @@ -678,7 +709,7 @@ TEST(RawApiTest, MakeTuple) { auto res_1 = ops::XRTReadLiteralAndRelease(root, handle_4); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({res_0, res_1}, &outputs)); xla::LiteralProto response_0; @@ -718,7 +749,7 @@ TEST(RawApiTest, ExecuteChainedOpByOp) { root, ops::Const(root.WithDevice("/device:CPU:0"), c_sub_scale)); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK( session.Run({c_add_scale_op.handle, c_sub_scale_op.handle}, &outputs)); @@ -788,7 +819,7 @@ TEST(RawApiTest, ExecuteChained) { root, ops::Const(root.WithDevice("/device:CPU:0"), c_sub_scale)); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK( session.Run({c_add_scale_op.handle, c_sub_scale_op.handle}, &outputs)); @@ -920,7 +951,7 @@ TEST(RawApiTest, CompileAndExecute) { auto read_back = ops::XRTReadLiteralAndRelease(root, result); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); @@ -975,7 +1006,7 @@ TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { auto read_back = ops::XRTReadLiteralAndRelease(root, result); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); @@ -1025,7 +1056,7 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) { auto release = ops::XRTReleaseCompilationHandle(root, c_handle.handle); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run(ClientSession::FeedType(), {c_handle.program_shape}, {release}, &outputs)); @@ -1094,7 +1125,7 @@ TEST(RawApiTest, DotGeneralWithLayoutTest) { auto read_back = ops::XRTReadLiteralAndRelease(root, result); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector 
outputs; TF_EXPECT_OK(session.Run({read_back}, &outputs)); @@ -1129,7 +1160,7 @@ TEST(RawApiTest, CompileAndExecuteZeroArg) { auto read_back = ops::XRTReadLiteralAndRelease(root, result); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back}, &outputs)); @@ -1179,7 +1210,7 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) { auto read_back = ops::XRTReadLiteralAndRelease(root, result); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back}, &outputs)); @@ -1230,7 +1261,7 @@ TEST(RawApiTest, CompileAndExecuteReturnExplodedTuple) { {Output(p0_handle), Output(p1_handle)}); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({result}, &outputs)); EXPECT_EQ(outputs.size(), 1); @@ -1272,7 +1303,7 @@ TEST(RawApiTest, LeakCompilationReference) { auto c_handle = ops::XRTCompile(root, computation); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({c_handle.handle}, &outputs)); } @@ -1316,7 +1347,7 @@ TEST(RawApiTest, CompileAndExecuteWithReusedBuffers) { e.set_release_compilation_handle(true); Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); - ClientSession session(root); + XrtClientSession session(root); auto e_config = ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); auto c_data = @@ -1412,7 +1443,7 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) { auto read_back = ops::XRTReadLiteralAndRelease(root, result); TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); @@ -1444,7 +1475,7 @@ TEST(RawApiTest, TestDeviceMemoryCompaction) { } TF_ASSERT_OK(root.status()); - ClientSession session(root); + XrtClientSession session(root); std::vector outputs; TF_EXPECT_OK(session.Run(handle_outputs, &outputs)); EXPECT_EQ(outputs.size(), handle_outputs.size()); @@ -1488,6 +1519,95 @@ TEST(RawApiTest, TestDeviceMemoryCompaction) { } } +TEST(RawApiTest, TestDeviceMemorySwap) { + const xla::Shape scalar_shape = xla::ShapeUtil::MakeShape(xla::F32, {}); + // 100MB F32 tensor. + const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {5000, 5000}); + const xla::int64 tensor_size = xla::ShapeUtil::ByteSizeOf(shape); + // On CPU we cannot trigger OOM/swap. For TPU and GPU we select 16GB as + // maximum memory. + xla::int64 device_memory_size = 8LL * 1024 * 1024 * 1024; + if (*xla_test_device_ptr == "TPU" || *xla_test_device_ptr == "XLA_GPU") { + device_memory_size = 16LL * 1024 * 1024 * 1024; + } + + xrt::XLAAllocation p0; + *p0.mutable_value() = xla::LiteralUtil::CreateR0(0.90434).ToProto(); + + // Create a computation which broadcasts a scalar to a big tensor. + xrt::XLAComputation c_bcast; + { + auto shapes = c_bcast.mutable_config()->mutable_program_shape(); + *shapes->add_parameters() = scalar_shape.ToProto(); + *shapes->mutable_result() = shape.ToProto(); + StoreComputationSnapshot( + BroadcastComputation(scalar_shape, shape.dimensions()), + c_bcast.mutable_hlo_snapshot()); + } + + // Create a computation which compares two tensors. 
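Aside (illustrative sketch, not part of the patch): the sizing chosen by this test is what forces the swap path. Assuming the 8 GiB default selected above, the arithmetic used by the num_tensors computation further down works out roughly as follows; the constant names here are hypothetical and only mirror the test's values.
#include <cstdint>
constexpr int64_t kTensorBytes = 5000LL * 5000 * sizeof(float);   // 100,000,000 bytes per tensor
constexpr int64_t kDeviceBytes = 8LL * 1024 * 1024 * 1024;        // 8 GiB default device memory
constexpr int64_t kNumTensors = 8 + kDeviceBytes / kTensorBytes;  // 8 + 85 = 93 tensors
static_assert(kNumTensors * kTensorBytes > kDeviceBytes,
              "the test must oversubscribe device memory to trigger swapping");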
+ xrt::XLAComputation c_equal; + { + auto shapes = c_equal.mutable_config()->mutable_program_shape(); + *shapes->add_parameters() = shape.ToProto(); + *shapes->add_parameters() = shape.ToProto(); + *shapes->mutable_result() = + xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + StoreComputationSnapshot(IsEqualComputation(shape), + c_equal.mutable_hlo_snapshot()); + } + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(false); + e.set_release_compilation_handle(false); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + XrtClientSession session(root); + auto e_config = + ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString()); + auto bcast_computation = + ops::Const(root.WithDevice("/device:CPU:0"), c_bcast.SerializeAsString()); + auto c_bcast_handle = ops::XRTCompile(root, bcast_computation); + auto equal_computation = + ops::Const(root.WithDevice("/device:CPU:0"), c_equal.SerializeAsString()); + auto c_equal_handle = ops::XRTCompile(root, equal_computation); + auto p0_value = + ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + std::vector outputs; + std::vector device_handles; + + // Create more data the device can take using the broadcast computation. + xla::int64 num_tensors = 8 + device_memory_size / tensor_size; + for (xla::int64 i = 0; i < num_tensors; ++i) { + auto result = ops::XRTExecute(root, c_bcast_handle.handle, e_config, + {Output(p0_handle)}); + TF_ASSERT_OK(root.status()); + TF_ASSERT_OK(session.Run({result}, &outputs)); + EXPECT_EQ(outputs.size(), 1); + device_handles.push_back(outputs[0].scalar()()); + } + + // Trigger computations on XRT handles to verify the swap-out/swap-in logic, + // by comparing sequential couple of tensors. + auto zero_literal = xla::LiteralUtil::CreateR0(0); + for (size_t i = 0; i + 1 < device_handles.size(); ++i) { + auto exec_op = ops::XRTExecute( + root, c_equal_handle.handle, e_config, + {Input(device_handles[i]), Input(device_handles[i + 1])}); + auto read_back = ops::XRTReadLiteral(root, exec_op); + + TF_ASSERT_OK(root.status()); + TF_ASSERT_OK(session.Run({read_back}, &outputs)); + EXPECT_EQ(outputs.size(), 1); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + auto literal = xla::Literal::CreateFromProto(response).ValueOrDie(); + EXPECT_EQ(literal, zero_literal); + } +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_memory_manager.cc b/tensorflow/compiler/xrt/xrt_memory_manager.cc new file mode 100644 index 00000000000..3a304764800 --- /dev/null +++ b/tensorflow/compiler/xrt/xrt_memory_manager.cc @@ -0,0 +1,353 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xrt/xrt_memory_manager.h" + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/core/lib/random/random.h" + +namespace tensorflow { +namespace { + +// We use kDeviceBits to store the device ordinal in the handle. We store the +// device in the upper part of the int64 handle to make sure the random bits are +// in the lower part which is better when storing the handle as a key for +// unordered maps. +const int kDeviceBits = 12; + +int64 MakeDeviceHandle(int64 device_ordinal, int64 rnd_value) { + const int64 kUidMask = (static_cast(1) << (64 - kDeviceBits)) - 1; + return (device_ordinal << (64 - kDeviceBits)) | (rnd_value & kUidMask); +} + +int GetDeviceFromHandle(int64 handle) { + return (handle >> (64 - kDeviceBits)) & ((1 << kDeviceBits) - 1); +} + +} // namespace + +class XRTMemoryManager::DeviceContext { + struct Alloc { + explicit Alloc(RefPtr tuple) + : tuple(std::move(tuple)) {} + + RefPtr tuple; + }; + + using AllocList = std::list; + + public: + int64 Register(RefPtr tuple) { + while (true) { + int64 handle = MakeDeviceHandle(tuple->device_ordinal(), CreateUid()); + mutex_lock lock(lock_); + allocs_.emplace_front(tuple); + if (alloc_map_.emplace(handle, allocs_.begin()).second) { + return handle; + } + // The chances of hitting an existing handle are so remote, it is much + // more convenient to add to the list before, and eventually removing. + allocs_.erase(allocs_.begin()); + } + } + + bool Release(int64 handle) { + mutex_lock lock(lock_); + auto it = alloc_map_.find(handle); + if (it == alloc_map_.end()) { + return false; + } + allocs_.erase(it->second); + alloc_map_.erase(it); + return true; + } + + RefPtr Lookup(int64 handle) { + mutex_lock lock(lock_); + auto it = alloc_map_.find(handle); + if (it == alloc_map_.end()) { + return nullptr; + } + // LRU + allocs_.splice(allocs_.begin(), allocs_, it->second); + return it->second->tuple; + } + + void Clear() { + mutex_lock lock(lock_); + alloc_map_.clear(); + allocs_.clear(); + } + + Status CompactAllocations(XRTMemoryManager* memory_manager, + xla::Backend* backend) { + VLOG(4) << "CompactAllocations started"; + mutex_lock lock(lock_); + Status status; + std::vector swapped; + // We are swapping out from the most recently used allocations. This is + // desirable since the most recently used will be finding themselves at the + // bottom of the allocation space. Since these are more likely to be pinned + // allocations, a further trim done by following TryFreeMemory() call will + // eventually drop the higher located allocations, with better chance of + // reducing fragmentation. + // Also, by swapping out the pinned allocations first, those will also be + // the first to be restored, and hence if we will ever find OOM on the way + // out, we would more likely be swapping in not pinned ones. + for (auto it = allocs_.begin(); it != allocs_.end(); ++it) { + // We are compacting all the allocations, so we will temporarily swap out + // even pinned allocations. + auto swap_result_or = it->tuple->SwapOut(backend, /*swap_pinned=*/true); + if (!swap_result_or.ok()) { + status = swap_result_or.status(); + break; + } + if (swap_result_or.ValueOrDie()) { + swapped.push_back(it); + } + } + // At this point we have released all the device memory we could release. + // Load back the tuple allocations we have swapped out above. 
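Aside (illustrative sketch, not part of the patch): the handle layout produced by the MakeDeviceHandle()/GetDeviceFromHandle() helpers earlier in this file keeps the device ordinal in the upper kDeviceBits (12) bits and the random uid in the lower 52 bits. A minimal standalone mirror of that round trip, assuming the same constants:
#include <cassert>
#include <cstdint>

constexpr int kDeviceBits = 12;

int64_t MakeDeviceHandle(int64_t device_ordinal, int64_t rnd_value) {
  const int64_t kUidMask = (static_cast<int64_t>(1) << (64 - kDeviceBits)) - 1;
  return (device_ordinal << (64 - kDeviceBits)) | (rnd_value & kUidMask);
}

int GetDeviceFromHandle(int64_t handle) {
  return (handle >> (64 - kDeviceBits)) & ((1 << kDeviceBits) - 1);
}

int main() {
  // Device ordinal 3, arbitrary uid; the uid survives in the low 52 bits.
  int64_t handle = MakeDeviceHandle(/*device_ordinal=*/3, /*rnd_value=*/0xABCDEF);
  assert(GetDeviceFromHandle(handle) == 3);
  assert((handle & ((static_cast<int64_t>(1) << (64 - kDeviceBits)) - 1)) == 0xABCDEF);
  return 0;
}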
+ for (auto& it : swapped) { + auto swap_result_or = it->tuple->SwapIn(memory_manager, backend); + if (!swap_result_or.ok()) { + // If we failed to restored a pinned allocation, better to CHECK here + // than wondering why XRTTupleAllocation calls fail with errors about + // missing buffers. + CHECK(!it->tuple->IsPinned()); // Crash OK + if (status.ok()) { + status = swap_result_or.status(); + } + } + } + VLOG(4) << "CompactAllocations finished: " << status; + return status; + } + + // Tries to free size bytes by freeing some unpinned device memory. Returns + // the amount of memory which was able to free. + xla::StatusOr TryFreeMemory(xla::Backend* backend, size_t size) { + mutex_lock lock(lock_); + size_t swapped_size = 0; + for (auto it = allocs_.rbegin(); it != allocs_.rend(); ++it) { + TF_ASSIGN_OR_RETURN(bool swap_result, + it->tuple->SwapOut(backend, /*swap_pinned=*/false)); + if (swap_result) { + swapped_size += it->tuple->GetDeviceMemorySize(); + if (swapped_size >= size) { + break; + } + } + } + VLOG(3) << "Swapped out " << swapped_size << " bytes"; + return swapped_size; + } + + private: + static int64 CreateUid() { + int64 uid; + do { + uid = random::New64() & INT64_MAX; + } while (uid == InvalidKey()); + return uid; + } + + // We store Alloc records inside an std::list so we can LRU it, and + // store the list iterators within the handle map, as list iterators don't get + // invalidated by (other elements) removals or position swaps. + mutex lock_; + AllocList allocs_; + std::unordered_map alloc_map_; +}; + +XRTMemoryManager::WorkingSet::WorkingSet( + RefPtr memory_manager) + : memory_manager_(std::move(memory_manager)) {} + +XRTMemoryManager::WorkingSet::~WorkingSet() { + for (auto& tuple : pinned_tuples_) { + tuple->Unpin(); + } +} + +Status XRTMemoryManager::WorkingSet::LookupAndPin(xla::Backend* backend, + int64 handle) { + TF_ASSIGN_OR_RETURN(auto tuple, memory_manager_->Lookup(handle)); + TF_RETURN_IF_ERROR( + tuple->PinAndSwapIn(memory_manager_.get(), backend).status()); + pinned_tuples_.push_back(std::move(tuple)); + return Status::OK(); +} + +/* static */ RefPtr XRTMemoryManager::Get(ResourceMgr* rm) { + static string* container = new string("XrtState"); + static string* name = new string("MemoryManager"); + XRTMemoryManager* memory_manager = nullptr; + TF_CHECK_OK(rm->LookupOrCreate( + *container, *name, &memory_manager, [](XRTMemoryManager** ret) { + *ret = new XRTMemoryManager(); + return Status::OK(); + })); + return memory_manager; +} + +int64 XRTMemoryManager::Register(RefPtr tuple) { + DeviceContext* device_context = GetDeviceContext(tuple->device_ordinal(), + /*create_if_missing=*/true); + return device_context->Register(std::move(tuple)); +} + +xla::StatusOr> XRTMemoryManager::Lookup( + int64 handle) { + int device_ordinal = GetDeviceFromHandle(handle); + DeviceContext* device_context = GetDeviceContext(device_ordinal, + /*create_if_missing=*/false); + if (device_context == nullptr) { + return errors::NotFound("XRT memory handle not found: ", handle); + } + RefPtr tuple = device_context->Lookup(handle); + if (tuple == nullptr) { + return errors::NotFound("XRT memory handle not found: ", handle); + } + return std::move(tuple); +} + +Status XRTMemoryManager::Release(int64 handle) { + int device_ordinal = GetDeviceFromHandle(handle); + DeviceContext* device_context = GetDeviceContext(device_ordinal, + /*create_if_missing=*/false); + if (device_context == nullptr || !device_context->Release(handle)) { + return errors::NotFound("XRT memory handle not found: ", 
handle); + } + return Status::OK(); +} + +Status XRTMemoryManager::CompactAllocations(xla::Backend* backend, + int device_ordinal) { + DeviceContext* device_context = GetDeviceContext(device_ordinal, + /*create_if_missing=*/false); + return device_context != nullptr + ? device_context->CompactAllocations(this, backend) + : Status::OK(); +} + +void XRTMemoryManager::ReleaseAllAllocations() { + mutex_lock lock(lock_); + for (auto& device_context : device_contexts_) { + if (device_context != nullptr) { + device_context->Clear(); + } + } +} + +xla::StatusOr XRTMemoryManager::Allocate( + xla::Backend* backend, int device_ordinal, size_t size) { + se::DeviceMemoryAllocator* allocator = backend->memory_allocator(); + auto memory_or = + allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false); + if (memory_or.status().code() == error::RESOURCE_EXHAUSTED) { + VLOG(4) << "Allocate of " << size << " bytes failed on device " + << device_ordinal; + + DeviceContext* device_context = + GetDeviceContext(device_ordinal, + /*create_if_missing=*/false); + if (device_context != nullptr) { + Status status = device_context->TryFreeMemory(backend, size).status(); + if (status.ok()) { + // As long as there is no error, we still try again the allocation, even + // if the TryFreeMemory() call ended up freeing less memory than the + // required size. Fragmentation could make the memory allocation succeed + // even if the freed memory is indeed lower. + memory_or = allocator->Allocate(device_ordinal, size, + /*retry_on_failure=*/false); + } else if (status.code() != error::RESOURCE_EXHAUSTED) { + VLOG(4) << "Allocate of " << size << " bytes on device " + << device_ordinal << ": " << status; + return status; + } + } + } + return memory_or; +} + +string XRTMemoryManager::DebugString() const { + // We might want to emit more detailed information here, like per device + // memory allocations. + return "XRTMemoryManager"; +} + +XRTMemoryManager::DeviceContext* XRTMemoryManager::GetDeviceContext( + int device_ordinal, bool create_if_missing) { + mutex_lock lock(lock_); + if (device_ordinal >= device_contexts_.size()) { + if (!create_if_missing) { + return nullptr; + } + device_contexts_.resize(device_ordinal + 1); + } + DeviceContext* device_context = device_contexts_[device_ordinal].get(); + if (device_context == nullptr && create_if_missing) { + device_contexts_[device_ordinal] = absl::make_unique(); + device_context = device_contexts_[device_ordinal].get(); + } + return device_context; +} + +Status XRTMemoryManager::TryFreeMemoryStep(MemoryReclaimContext* mrctx, + const Status& status) { + DeviceContext* device_context = GetDeviceContext(mrctx->device_ordinal, + /*create_if_missing=*/false); + if (device_context == nullptr) { + return status; + } + if (!mrctx->done_freeing) { + // If the caller passed us a zero requested_free_size, we try to free chunks + // of kMaxFreeSize memory, until either the run function suceeds, or we run + // out of freeable memory. + const size_t kMaxFreeSize = 1000000000; + size_t free_size = + (mrctx->requested_free_size > 0) + ? 
std::min(mrctx->requested_free_size - mrctx->free_size,
+ kMaxFreeSize)
+ : kMaxFreeSize;
+ if (free_size > 0) {
+ auto free_size_or =
+ device_context->TryFreeMemory(mrctx->backend, free_size);
+ if (!free_size_or.ok()) {
+ return status;
+ }
+ size_t size = free_size_or.ValueOrDie();
+ mrctx->free_size += size;
+ if (size > 0) {
+ return Status::OK();
+ }
+ }
+ mrctx->done_freeing = true;
+ }
+ if (!mrctx->done_compacting) {
+ mrctx->done_compacting = true;
+ if (device_context->CompactAllocations(this, mrctx->backend).ok()) {
+ return Status::OK();
+ }
+ }
+ return status;
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/xrt_memory_manager.h b/tensorflow/compiler/xrt/xrt_memory_manager.h
new file mode 100644
index 00000000000..445be45cf57
--- /dev/null
+++ b/tensorflow/compiler/xrt/xrt_memory_manager.h
@@ -0,0 +1,177 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XRT_XRT_MEMORY_MANAGER_H_
+#define TENSORFLOW_COMPILER_XRT_XRT_MEMORY_MANAGER_H_
+
+#include
+#include
+
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/compiler/xrt/xrt_refptr.h"
+#include "tensorflow/compiler/xrt/xrt_state.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace tensorflow {
+
+// The XRTMemoryManager manages all the XRT allocations. It is a ResourceBase
+// object which lives within the ResourceMgr. There is only one XRT memory
+// manager object within the ResourceMgr container.
+class XRTMemoryManager : public ResourceBase {
+ // The DeviceContext class, defined and implemented locally inside the
+ // xrt_memory_manager.cc file, holds, for each device, all the information
+ // related to the XRT memory management for that device.
+ class DeviceContext;
+
+ public:
+ // A working set is a set of tuple allocations which are the inputs of a given
+ // operation, and as such they must be pinned in device memory. The tuple
+ // allocations added to the WorkingSet will be unpinned at object destruction.
+ class WorkingSet {
+ public:
+ explicit WorkingSet(RefPtr<XRTMemoryManager> memory_manager);
+
+ ~WorkingSet();
+
+ // Looks up the tuple handle within the memory manager, and pins it to the
+ // device (if not already pinned).
+ Status LookupAndPin(xla::Backend* backend, int64 handle);
+
+ const std::vector<RefPtr<XRTTupleAllocation>>& PinnedTuples() const {
+ return pinned_tuples_;
+ }
+
+ const RefPtr<XRTMemoryManager>& MemoryManager() const {
+ return memory_manager_;
+ }
+
+ private:
+ RefPtr<XRTMemoryManager> memory_manager_;
+ std::vector<RefPtr<XRTTupleAllocation>> pinned_tuples_;
+ };
+
+ // Retrieves the XRTMemoryManager singleton stored within the ResourceMgr.
+ static RefPtr<XRTMemoryManager> Get(ResourceMgr* rm);
+
+ // Registers an XRTTupleAllocation and returns the unique handle identifying
+ // it.
+ int64 Register(RefPtr<XRTTupleAllocation> tuple);
+
+ // Looks up a handle returned by the Register() API and returns the
+ // XRTTupleAllocation behind it.
+ xla::StatusOr<RefPtr<XRTTupleAllocation>> Lookup(int64 handle);
+
+ Status Lookup(int64 handle, RefPtr<XRTTupleAllocation>* tuple) {
+ TF_ASSIGN_OR_RETURN(*tuple, Lookup(handle));
+ return Status::OK();
+ }
+
+ // Releases a handle by dropping the reference count held on the
+ // XRTTupleAllocation by the XRTMemoryManager. Existing XRTTupleAllocation
+ // references will continue to be valid.
+ Status Release(int64 handle);
+
+ // Tries to compact all the memory allocations on a given device. This is
+ // currently done by swapping out all the existing allocations and swapping
+ // them back in.
+ Status CompactAllocations(xla::Backend* backend, int device_ordinal);
+
+ // Releases all the device memory allocated by XRT within the resource
+ // manager.
+ void ReleaseAllAllocations();
+
+ // Tries to allocate size bytes of device memory from the device_ordinal
+ // device. If the underlying allocator call fails, it might attempt to free
+ // some unpinned device memory and try the allocation again.
+ xla::StatusOr<se::OwningDeviceMemory> Allocate(xla::Backend* backend,
+ int device_ordinal,
+ size_t size);
+
+ // Runs the specified function, handling the error::RESOURCE_EXHAUSTED
+ // status code coming out of it. In such cases, we run different memory
+ // freeing operations trying to make runfn succeed. The requested_free_size
+ // argument represents a hint of the requested memory size which would make
+ // runfn succeed.
+ template <typename T>
+ xla::StatusOr<T> Run(const std::function<xla::StatusOr<T>()>& runfn,
+ xla::Backend* backend, int device_ordinal,
+ size_t requested_free_size);
+
+ string DebugString() const override;
+
+ // Returns the invalid key value, which will never be generated by the
+ // Register() API.
+ static int64 InvalidKey() { return 0; }
+
+ private:
+ // Structure used to track the progress of a try-to-free operation. It is
+ // initialized and then passed to the TryFreeMemoryStep() API.
+ struct MemoryReclaimContext {
+ MemoryReclaimContext(xla::Backend* backend, int device_ordinal,
+ size_t requested_free_size)
+ : backend(backend),
+ device_ordinal(device_ordinal),
+ requested_free_size(requested_free_size) {}
+
+ xla::Backend* const backend = nullptr;
+ const int device_ordinal = 0;
+ const size_t requested_free_size = 0;
+ size_t free_size = 0;
+ bool done_freeing = false;
+ bool done_compacting = false;
+ };
+
+ DeviceContext* GetDeviceContext(int device_ordinal, bool create_if_missing);
+
+ // Called multiple times while trying to make a memory-consuming function call
+ // fit. Performs progressively more expensive memory reduction operations,
+ // until returning error::RESOURCE_EXHAUSTED when no further reductions are
+ // possible.
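Aside (illustrative sketch, not part of the patch): this is roughly how an op kernel is expected to combine the API declared above: pin inputs through a WorkingSet, then wrap allocation-heavy work in Run<T>() so a RESOURCE_EXHAUSTED error triggers memory reclamation and a retry. The helper name RunWithPinnedInputs and the 1 MiB sizes are hypothetical; `rm`, `backend` and `device_ordinal` are assumed to come from the surrounding kernel code.
Status RunWithPinnedInputs(ResourceMgr* rm, xla::Backend* backend,
                           int device_ordinal, int64 input_handle) {
  RefPtr<XRTMemoryManager> memory_manager = XRTMemoryManager::Get(rm);
  // Pin the input tuple for the lifetime of the working set; it is unpinned
  // automatically when working_set goes out of scope.
  XRTMemoryManager::WorkingSet working_set(memory_manager);
  TF_RETURN_IF_ERROR(working_set.LookupAndPin(backend, input_handle));
  // Any RESOURCE_EXHAUSTED status returned by the lambda makes Run<T>() free
  // or compact unpinned allocations and call the lambda again.
  TF_ASSIGN_OR_RETURN(
      se::OwningDeviceMemory memory,
      memory_manager->Run<se::OwningDeviceMemory>(
          [&]() { return memory_manager->Allocate(backend, device_ordinal,
                                                  /*size=*/1 << 20); },
          backend, device_ordinal, /*requested_free_size=*/1 << 20));
  (void)memory;  // Use the allocated memory here.
  return Status::OK();
}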
+ Status TryFreeMemoryStep(MemoryReclaimContext* mrctx, const Status& status); + + mutex lock_; + std::vector> device_contexts_; +}; + +template +xla::StatusOr XRTMemoryManager::Run( + const std::function()>& runfn, xla::Backend* backend, + int device_ordinal, size_t requested_free_size) { + MemoryReclaimContext mrctx(backend, device_ordinal, requested_free_size); + while (true) { + // We assume that runfn is a relatively fast-fail function compared to the + // operations required to free up the required memory. Here we call into the + // TryFreeMemoryStep() API multiple times, which will run progressively more + // expensive operations. + auto result_or = runfn(); + if (result_or.status().code() != error::RESOURCE_EXHAUSTED) { + return result_or; + } + TF_RETURN_IF_ERROR(TryFreeMemoryStep(&mrctx, result_or.status())); + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_XRT_XRT_MEMORY_MANAGER_H_ diff --git a/tensorflow/compiler/xrt/xrt_refptr.h b/tensorflow/compiler/xrt/xrt_refptr.h new file mode 100644 index 00000000000..2db20dd71ce --- /dev/null +++ b/tensorflow/compiler/xrt/xrt_refptr.h @@ -0,0 +1,108 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Utility functions in support of the XRT API. + +#ifndef TENSORFLOW_COMPILER_XRT_XRT_REFPTR_H_ +#define TENSORFLOW_COMPILER_XRT_XRT_REFPTR_H_ + +#include + +namespace tensorflow { + +// Reference counted smart pointer for XRT objects providing the standard +// Ref()/Unref() APIs. +template +class RefPtr { + public: + RefPtr() = default; + // Creates a RefPtr from a pointer. This is an ownership transfer operation, + // and the caller has to own a valid reference to ptr (unless ptr is nullptr). 
+ RefPtr(T* ptr) : ptr_(ptr) {} // NOLINT + RefPtr(const RefPtr& other) : ptr_(other.ptr_) { Acquire(ptr_); } + RefPtr(RefPtr&& other) : ptr_(other.ptr_) { other.ptr_ = nullptr; } + + ~RefPtr() { Release(ptr_); } + + RefPtr& operator=(const RefPtr& other) { + if (this != &other) { + Acquire(other.ptr_); + Release(ptr_); + ptr_ = other.ptr_; + } + return *this; + } + + RefPtr& operator=(RefPtr&& other) { + if (this != &other) { + Release(ptr_); + ptr_ = other.ptr_; + other.ptr_ = nullptr; + } + return *this; + } + + operator bool() const { return ptr_ != nullptr; } // NOLINT + bool operator==(const RefPtr& rhs) const { return ptr_ == rhs.ptr_; } + bool operator!=(const RefPtr& rhs) const { return ptr_ != rhs.ptr_; } + bool operator==(const T* ptr) const { return ptr_ == ptr; } + bool operator!=(const T* ptr) const { return ptr_ != ptr; } + bool operator==(std::nullptr_t ptr) const { return ptr_ == ptr; } + bool operator!=(std::nullptr_t ptr) const { return ptr_ != ptr; } + + T* get() const { return ptr_; } + + T* operator->() const { + CHECK(ptr_ != nullptr); // Crash OK + return ptr_; + } + + T& operator*() const { + CHECK(ptr_ != nullptr); // Crash OK + return *ptr_; + } + + T* release() { + T* ptr = ptr_; + ptr_ = nullptr; + return ptr; + } + + // Resets the RefPtr from a pointer. This is an ownership transfer operation, + // and the caller has to own a valid reference to ptr (unless ptr is nullptr). + void reset(T* ptr = nullptr) { + Release(ptr_); + ptr_ = ptr; + } + + private: + static void Release(T* ptr) { + if (ptr != nullptr) { + ptr->Unref(); + } + } + + static void Acquire(T* ptr) { + if (ptr != nullptr) { + ptr->Ref(); + } + } + + T* ptr_ = nullptr; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_XRT_XRT_REFPTR_H_ diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc index fa25b727a3d..2f5eb5aec1e 100644 --- a/tensorflow/compiler/xrt/xrt_state.cc +++ b/tensorflow/compiler/xrt/xrt_state.cc @@ -18,31 +18,24 @@ limitations under the License. #include "tensorflow/compiler/xrt/xrt_state.h" -#include - #include #include #include #include #include "absl/memory/memory.h" -#include "absl/strings/str_cat.h" -#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/backend.h" -#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/stream_executor/stream_executor.h" +#include "tensorflow/compiler/xrt/xrt_memory_manager.h" namespace tensorflow { - namespace { +// Helper typedef to make ShapeTree ForEach helper lambda signatures more +// readable. They need a type of const T& where in this case T is the +// following pointer. 
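Aside (illustrative sketch, not part of the patch): the RefPtr<T> wrapper introduced in xrt_refptr.h above adopts an existing reference when constructed from a raw pointer, adds a reference on copy, and drops one on destruction. A minimal sketch of those semantics, assuming a stand-in RefCountedThing type derived from core::RefCounted:
class RefCountedThing : public core::RefCounted {};

void RefPtrExample() {
  // A freshly created RefCounted object starts with one reference; the RefPtr
  // constructor adopts it (ownership transfer, no extra Ref()).
  RefPtr<RefCountedThing> ptr(new RefCountedThing());
  // Copying adds a reference; both copies drop theirs when leaving scope.
  RefPtr<RefCountedThing> copy = ptr;
  CHECK(ptr == copy);
  // release() hands the reference back to the caller, who must Unref() it.
  RefCountedThing* raw = copy.release();
  raw->Unref();
}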
+typedef XRTBufferAllocation* XRTBufferAllocationPtr; + class BufferAllocStats { public: struct Stats { @@ -71,26 +64,15 @@ class BufferAllocStats { std::map stats_; }; -const char* kTupleContainer = "tuples"; - -int64 get_uid() { - int64 uid; - do { - uid = random::New64() & INT64_MAX; - } while (uid == XRTTupleAllocation::InvalidKey()); - return uid; -} - BufferAllocStats* GetAllocStats() { static BufferAllocStats* stats = new BufferAllocStats(); return stats; } Status AllocateScopedShapedBuffer( - xla::Backend* backend, int device_ordinal, const xla::Shape& shape, - std::unique_ptr* buffer) { + XRTMemoryManager* memory_manager, xla::Backend* backend, int device_ordinal, + const xla::Shape& shape, std::unique_ptr* buffer) { auto transfer_manager = backend->transfer_manager(); - auto allocator = backend->memory_allocator(); TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal)); // XLA may use a different representation on device than the representation on @@ -111,18 +93,19 @@ Status AllocateScopedShapedBuffer( // it goes out of scope. That's useful if we return early as the result of an // error allocating one of the later buffers. *buffer = absl::make_unique( - shape, on_device_shape, allocator, device_ordinal); + shape, on_device_shape, backend->memory_allocator(), device_ordinal); for (auto& index_to_buffer : (*buffer)->buffers()) { - xla::Shape subshape = + const xla::Shape& subshape = xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first); uint64 size = transfer_manager->GetByteSizeRequirement(subshape); TF_ASSIGN_OR_RETURN( se::OwningDeviceMemory buffer, - allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false)); + memory_manager->Allocate(backend, device_ordinal, size)); // Move our buffer into shaped_buffer, which takes ownership of it. index_to_buffer.second = buffer.Release(); VLOG(2) << "Allocated buffer at " << index_to_buffer.second.opaque() - << " index " << index_to_buffer.first.ToString(); + << " index " << index_to_buffer.first.ToString() << " (" << size + << " bytes)"; } TF_RETURN_IF_ERROR( @@ -136,8 +119,7 @@ Status AllocateScopedShapedBuffer( XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation, int device_ordinal, se::DeviceMemoryAllocator* allocator) - : size_(allocation.size()), - allocation_(allocation), + : allocation_(allocation), device_ordinal_(device_ordinal), allocator_(allocator) { if (VLOG_IS_ON(2)) { @@ -153,21 +135,15 @@ XRTBufferAllocation::~XRTBufferAllocation() { GetAllocStats()->ReportFree(device_ordinal_, allocation_.size()); } // Deallocate explicitly allows allocation_ to be null. - Status s = allocator_->Deallocate(device_ordinal_, allocation_); - // Nothing to do but check fail here if memory datastructures are corrupted. - CHECK(s.ok()); - VLOG(2) << "Freed buffer at " << allocation_.opaque(); + TF_CHECK_OK(allocator_->Deallocate(device_ordinal_, allocation_)); + VLOG(2) << "Freed buffer at " << allocation_.opaque() << " (" + << allocation_.size() << " bytes)"; } const se::DeviceMemoryBase& XRTBufferAllocation::allocation() { return allocation_; } -void XRTBufferAllocation::DiscardAllocation() { - // Replace the allocation with a null. 
- allocation_ = se::DeviceMemoryBase(); -} - XRTTupleAllocation::XRTTupleAllocation(int device_ordinal, se::DeviceMemoryAllocator* allocator, const xla::Shape& on_host_shape, @@ -176,23 +152,29 @@ XRTTupleAllocation::XRTTupleAllocation(int device_ordinal, allocator_(allocator), on_host_shape_(on_host_shape), on_device_shape_(on_device_shape), - buffers_(&on_device_shape_) {} + buffers_(&on_device_shape_), + pin_count_(0) {} -XRTTupleAllocation::~XRTTupleAllocation() { - for (auto& buffer : buffers_) { - buffer.second->Unref(); +XRTTupleAllocation::~XRTTupleAllocation() { ReleaseBuffers(); } + +void XRTTupleAllocation::ReleaseBuffers() { + for (auto& index_buffer : buffers_) { + if (index_buffer.second != nullptr) { + index_buffer.second->Unref(); + index_buffer.second = nullptr; + } } } /*static*/ Status XRTTupleAllocation::CreateAndTransfer( - const xla::LiteralBase& literal, xla::Backend* backend, int device_ordinal, + const xla::LiteralBase& literal, XRTMemoryManager* memory_manager, + xla::Backend* backend, int device_ordinal, XRTTupleAllocation** allocation) { auto transfer_manager = backend->transfer_manager(); - auto allocator = backend->memory_allocator(); - std::unique_ptr scoped_buffer; - TF_RETURN_IF_ERROR(AllocateScopedShapedBuffer( - backend, device_ordinal, literal.shape(), &scoped_buffer)); + TF_RETURN_IF_ERROR(AllocateScopedShapedBuffer(memory_manager, backend, + device_ordinal, literal.shape(), + &scoped_buffer)); TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal)); TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( stream.get(), literal, *scoped_buffer)); @@ -202,11 +184,13 @@ XRTTupleAllocation::~XRTTupleAllocation() { // call. To avoid a leak, there must be no error-case returns from here until // the end of the method. auto shaped_buffer = scoped_buffer->release(); - *allocation = new XRTTupleAllocation(device_ordinal, allocator, - shaped_buffer.on_host_shape(), - shaped_buffer.on_device_shape()); + *allocation = new XRTTupleAllocation( + device_ordinal, backend->memory_allocator(), + shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape()); (*allocation) - ->InitializeFromShapedBuffer(shaped_buffer, allocator, device_ordinal); + ->InitializeFromShapedBuffer(shaped_buffer, backend->memory_allocator(), + device_ordinal); + (*allocation)->SetDeviceMemorySize(); return Status::OK(); } @@ -220,24 +204,22 @@ XRTTupleAllocation::~XRTTupleAllocation() { shaped_buffer.on_device_shape()); (*allocation) ->InitializeFromShapedBuffer(shaped_buffer, allocator, device_ordinal); + (*allocation)->SetDeviceMemorySize(); return Status::OK(); } -Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal, +Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, xla::MutableLiteralBase* literal) { - auto transfer_manager = backend->transfer_manager(); - TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal)); + mutex_lock lock(lock_); + return literal_ == nullptr ? StoreToLiteral(backend, literal) + : literal->CopyFrom(*literal_); +} - // Validate the allocation buffers as if nulls gets to - // TransferLiteralFromDevice() a CHECK is issued. 
- xla::ShapedBuffer shaped_buffer = ToShapedBuffer(); - for (auto& index_buffer : shaped_buffer.buffers()) { - if (index_buffer.second.is_null()) { - return errors::InvalidArgument("Literal buffer at index ", - index_buffer.first.ToString(), - " has been released"); - } - } +Status XRTTupleAllocation::StoreToLiteral(xla::Backend* backend, + xla::MutableLiteralBase* literal) { + auto transfer_manager = backend->transfer_manager(); + TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal())); + TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, ToShapedBuffer()); return transfer_manager->TransferLiteralFromDevice(stream.get(), shaped_buffer, *literal); } @@ -250,52 +232,102 @@ Status XRTTupleAllocation::WriteLiteral(xla::Backend* backend, xla::ShapeUtil::HumanStringWithLayout(literal.shape()), " device=", xla::ShapeUtil::HumanStringWithLayout(on_host_shape())); } + mutex_lock lock(lock_); + if (literal_ != nullptr) { + // The allocation is currently swapped out, and we have a host literal for + // its content. Just update the host literal with the new value. + return literal_->CopyFrom(literal); + } + TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, ToShapedBuffer()); auto transfer_manager = backend->transfer_manager(); TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal())); return transfer_manager->TransferLiteralToDevice(stream.get(), literal, - ToShapedBuffer()); + shaped_buffer); } +xla::StatusOr XRTTupleAllocation::SwapOut(xla::Backend* backend, + bool swap_pinned) { + mutex_lock lock(lock_); + if (literal_ == nullptr && (!IsPinned() || swap_pinned)) { + xla::Literal literal(on_host_shape()); + TF_RETURN_IF_ERROR(StoreToLiteral(backend, &literal)); + ReleaseBuffers(); + literal_ = absl::make_unique(std::move(literal)); + return true; + } + return false; +} + +xla::StatusOr XRTTupleAllocation::SwapIn(XRTMemoryManager* memory_manager, + xla::Backend* backend) { + // We need to call AllocateScopedShapedBuffer() outside the locks, since the + // XRTMemoryManager might end up calling back into the SwapOut() API. + // So we do a quick check before using the IsSwapped() API, and it can happen + // that the allocation becomes swapped in after the check. This means which we + // will end up doing an allocation, and then releasing it soon after (via its + // scoped variables). This is an unlikely scenario (two threads calling + // SwapIn() on the same allocation) though. 
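Aside (illustrative sketch, not part of the patch): an XRTTupleAllocation is now in one of two states: resident (buffers_ holds device memory, literal_ is null) or swapped out (buffers_ released, literal_ holds the host copy). A minimal round trip under that assumption, with `tuple`, `memory_manager` and `backend` assumed to exist:
Status SwapRoundTrip(XRTTupleAllocation* tuple,
                     XRTMemoryManager* memory_manager, xla::Backend* backend) {
  // Unpinned allocations can be swapped out to a host literal...
  TF_ASSIGN_OR_RETURN(bool swapped_out,
                      tuple->SwapOut(backend, /*swap_pinned=*/false));
  if (swapped_out) {
    CHECK(tuple->IsSwapped());
    // ...and must be swapped back in before their device buffers are used.
    TF_ASSIGN_OR_RETURN(bool swapped_in, tuple->SwapIn(memory_manager, backend));
    CHECK(swapped_in);
  }
  return Status::OK();
}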
+ if (!IsSwapped()) { + return false; + } + + auto transfer_manager = backend->transfer_manager(); + std::unique_ptr scoped_buffer; + TF_RETURN_IF_ERROR( + AllocateScopedShapedBuffer(memory_manager, backend, device_ordinal(), + on_host_shape(), &scoped_buffer)); + TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal())); + + mutex_lock lock(lock_); + if (literal_ != nullptr) { + TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( + stream.get(), *literal_, *scoped_buffer)); + + auto shaped_buffer = scoped_buffer->release(); + InitializeFromShapedBuffer(shaped_buffer, backend->memory_allocator(), + device_ordinal()); + literal_ = nullptr; + return true; + } + return false; +} + +xla::StatusOr XRTTupleAllocation::PinAndSwapIn( + XRTMemoryManager* memory_manager, xla::Backend* backend) { + Pin(); + return SwapIn(memory_manager, backend); +} + +bool XRTTupleAllocation::IsSwapped() const { + mutex_lock lock(lock_); + return literal_ != nullptr; +} + +int64 XRTTupleAllocation::Pin() { return pin_count_.fetch_add(1); } + +int64 XRTTupleAllocation::Unpin() { return pin_count_.fetch_sub(1); } + +bool XRTTupleAllocation::IsPinned() const { return pin_count_ != 0; } + void XRTTupleAllocation::DiscardAllocation( const xla::ShapeIndex& buffer_index) { buffers_.element(buffer_index)->DiscardAllocation(); } -const xla::Shape& XRTTupleAllocation::on_host_shape() { return on_host_shape_; } +const xla::Shape& XRTTupleAllocation::on_host_shape() const { + return on_host_shape_; +} -const xla::Shape& XRTTupleAllocation::on_device_shape() { +const xla::Shape& XRTTupleAllocation::on_device_shape() const { return on_device_shape_; } -int XRTTupleAllocation::device_ordinal() { return device_ordinal_; } +int XRTTupleAllocation::device_ordinal() const { return device_ordinal_; } -const se::DeviceMemoryBase& XRTTupleAllocation::root_allocation() { +const se::DeviceMemoryBase& XRTTupleAllocation::root_allocation() const { return buffers_.element({})->allocation(); } -/*static*/ Status XRTTupleAllocation::Lookup(ResourceMgr* rm, int64 key, - XRTTupleAllocation** allocation) { - string key_string = absl::StrCat(key); - TF_RETURN_IF_ERROR(rm->Lookup(kTupleContainer, key_string, allocation)); - return Status::OK(); -} - -/*static*/ Status XRTTupleAllocation::DeleteFromResourceManager(ResourceMgr* rm, - int64 key) { - string key_string = absl::StrCat(key); - return rm->Delete(kTupleContainer, key_string); -} - -/* static */ Status XRTTupleAllocation::ReleaseAllAllocations(ResourceMgr* rm) { - VLOG(1) << "Releasing all XRT held device memory"; - return rm->Cleanup(kTupleContainer); -} - -// Helper typedef to make ShapeTree ForEach helper lambda signatures more -// readable. They need a type of const T& where in this case T is the -// following pointer. 
-typedef XRTBufferAllocation* XRTBufferAllocationPtr; - /*static*/ Status XRTTupleAllocation::MakeSubBuffer( XRTTupleAllocation* parent, const xla::ShapeIndex& subshape, XRTTupleAllocation** allocation, bool alias_parent_allocation) { @@ -330,46 +362,21 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr; parent_index.push_back(index[i]); } *buffer = parent->buffers_.element(parent_index); - *parent->buffers_.mutable_element(parent_index) = - new XRTBufferAllocation(se::DeviceMemoryBase(), - parent->device_ordinal(), - parent->allocator_); + *parent->buffers_.mutable_element(parent_index) = nullptr; }); } - + (*allocation)->SetDeviceMemorySize(); return Status::OK(); } -/* static */ Status XRTTupleAllocation::CompactAllocations( - ResourceMgr* rm, xla::Backend* backend, int device_ordinal) { - std::vector tuples; - rm->GetContainerResources(kTupleContainer, &tuples); - - std::vector> host_tuples; - for (auto& rm_tuple : tuples) { - XRTTupleAllocation* tuple = - dynamic_cast(rm_tuple.resource.get()); - if (tuple->device_ordinal() == device_ordinal) { - xla::Literal literal(tuple->on_host_shape()); - TF_RETURN_IF_ERROR(tuple->ToLiteral(backend, device_ordinal, &literal)); - host_tuples.emplace_back(rm_tuple.name, std::move(literal)); - // At this point there are two references held onto the XRTTupleAllocation - // object. One in the ResourceMgr, which we release here, and one held - // within the tuples vector, which we release in the tuples.clear() call - // below. - TF_RETURN_IF_ERROR( - rm->Delete(kTupleContainer, rm_tuple.name)); +void XRTTupleAllocation::SetDeviceMemorySize() { + size_t size = 0; + for (auto& index_buffer : buffers_) { + if (index_buffer.second != nullptr) { + size += index_buffer.second->allocation().size(); } } - tuples.clear(); - - for (auto& name_literal : host_tuples) { - XRTTupleAllocation* tuple; - TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateAndTransfer( - name_literal.second, backend, device_ordinal, &tuple)); - TF_RETURN_IF_ERROR(rm->Create(kTupleContainer, name_literal.first, tuple)); - } - return Status::OK(); + device_memory_size_ = size; } /* static */ Status XRTTupleAllocation::ExpandTreeOfTuples( @@ -414,7 +421,7 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr; } /*static*/ Status XRTTupleAllocation::MakeTuple( - xla::Backend* backend, int device_ordinal, + XRTMemoryManager* memory_manager, xla::Backend* backend, int device_ordinal, const xla::ShapeTree& elements, XRTTupleAllocation** allocation) { auto transfer_manager = backend->transfer_manager(); @@ -429,8 +436,8 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr; // The aliasing is determined below based on whether or not all the inputs are // released while being transferred. allocation_tmp is a local pointer that is // copied to *allocation at the end only if the method succeeds. - auto allocation_tmp = new XRTTupleAllocation(device_ordinal, allocator, - host_shape, device_shape); + XRTTupleAllocation* allocation_tmp = new XRTTupleAllocation( + device_ordinal, allocator, host_shape, device_shape); core::ScopedUnref allocation_unref(allocation_tmp); // First allocate device memory for the new tuple index tables, one at each // internal node of the elements tree. 
Do this in a separate pass into a @@ -444,12 +451,12 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr; TF_RETURN_IF_ERROR(elements.ForEachElementWithStatus( [&](const xla::ShapeIndex& index, const ExpandedTupleInput& element) { if (!elements.IsLeaf(index)) { - xla::Shape subshape = + const xla::Shape& subshape = xla::ShapeUtil::GetSubshape(device_shape, index); uint64 size = transfer_manager->GetByteSizeRequirement(subshape); - TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer, - allocator->Allocate(device_ordinal, size, - /*retry_on_failure=*/false)); + TF_ASSIGN_OR_RETURN( + se::OwningDeviceMemory buffer, + memory_manager->Allocate(backend, device_ordinal, size)); VLOG(2) << "Allocated buffer at " << buffer->opaque() << " index " << index.ToString(); // Move the new buffer into new_tuple_buffers, which takes ownership @@ -487,10 +494,8 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr; // validated that release_allocation_after_use is false if // element.allocation appears in more than one leaf. element.allocation->buffers_.ForEachMutableElement( - [&](const xla::ShapeIndex& index, XRTBufferAllocationPtr* buffer) { - *buffer = new XRTBufferAllocation( - se::DeviceMemoryBase(), element.allocation->device_ordinal(), - element.allocation->allocator_); + [&](const xla::ShapeIndex&, XRTBufferAllocationPtr* buffer) { + *buffer = nullptr; }); } else { // Increment the refcount on each newly-aliased buffer. @@ -506,6 +511,7 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr; allocator); } }); + allocation_tmp->SetDeviceMemorySize(); // Because the internal nodes of tuple_buffers are exactly the new index // tables, WriteTupleIndexTables will write only the new index tables and not // rewrite the index tables for the existing allocations. @@ -519,36 +525,47 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr; return Status::OK(); } -Status XRTTupleAllocation::Intern(ResourceMgr* rm, int64* key) { - *key = get_uid(); - string key_string = absl::StrCat(*key); - return rm->Create(kTupleContainer, key_string, this); -} - -bool XRTTupleAllocation::IsExclusiveOwner() { - for (const auto& buffer : buffers_) { - if (!buffer.second->RefCountIsOne()) return false; +bool XRTTupleAllocation::IsExclusiveOwner() const { + for (const auto& index_buffer : buffers_) { + if (index_buffer.second != nullptr && + !index_buffer.second->RefCountIsOne()) { + return false; + } } return true; } +size_t XRTTupleAllocation::GetDeviceMemorySize() const { + return device_memory_size_; +} + void XRTTupleAllocation::InitializeFromShapedBuffer( const xla::ShapedBuffer& shaped_buffer, se::DeviceMemoryAllocator* allocator, int device_ordinal) { - for (auto& buffer : buffers_) { + for (auto& index_buffer : buffers_) { + if (index_buffer.second != nullptr) { + index_buffer.second->Unref(); + } // Make a reference-counted version of the allocated buffer. 
- buffer.second = new XRTBufferAllocation(shaped_buffer.buffer(buffer.first), - device_ordinal, allocator); + index_buffer.second = new XRTBufferAllocation( + shaped_buffer.buffer(index_buffer.first), device_ordinal, allocator); } } -xla::ShapedBuffer XRTTupleAllocation::ToShapedBuffer() { +xla::StatusOr XRTTupleAllocation::ToShapedBuffer() { xla::ShapedBuffer shaped_buffer(on_host_shape(), on_device_shape(), allocator_->platform(), device_ordinal_); - for (const auto& buffer : buffers_) { - shaped_buffer.set_buffer(buffer.second->allocation(), buffer.first); + for (const auto& index_buffer : buffers_) { + if (index_buffer.second == nullptr || + index_buffer.second->allocation().is_null()) { + return errors::InvalidArgument("Literal buffer at index ", + index_buffer.first.ToString(), + " has been released"); + } + shaped_buffer.set_buffer(index_buffer.second->allocation(), + index_buffer.first); } - return shaped_buffer; + return std::move(shaped_buffer); } Status XRTTupleAllocation::AliasBufferFrom(const XRTTupleAllocation& source, @@ -556,37 +573,69 @@ Status XRTTupleAllocation::AliasBufferFrom(const XRTTupleAllocation& source, const xla::ShapeIndex& dest_index) { XRTBufferAllocation* source_buffer = source.buffers_.element(source_index); XRTBufferAllocation* dest_buffer = buffers_.element(dest_index); - // We allow the destination size being zero, because there are cases where we - // are coming in later filling in null/uninitialized device buffers. - // In all other cases, the size of the new buffer must match. - if (source_buffer->size() != dest_buffer->size() && - dest_buffer->size() != 0) { - return errors::InvalidArgument( - "Source buffer at index ", source_index.ToString(), - " does not match the size of destination buffer at index ", - dest_index.ToString(), ": ", source_buffer->size(), " vs ", - dest_buffer->size()); + if (dest_buffer != nullptr) { + // We allow the destination size being zero, because there are cases where + // we are coming in later filling in null/uninitialized device buffers. In + // all other cases, the size of the new buffer must match. + if (source_buffer->allocation().size() != + dest_buffer->allocation().size() && + dest_buffer->allocation().size() != 0) { + return errors::InvalidArgument( + "Source buffer at index ", source_index.ToString(), + " does not match the size of destination buffer at index ", + dest_index.ToString(), ": ", source_buffer->allocation().size(), + " vs ", dest_buffer->allocation().size()); + } + } else { + const xla::Shape& source_subshape = + xla::ShapeUtil::GetSubshape(source.on_device_shape(), source_index); + const xla::Shape& dest_subshape = + xla::ShapeUtil::GetSubshape(on_device_shape(), dest_index); + if (!xla::ShapeUtil::Equal(source_subshape, dest_subshape)) { + return errors::InvalidArgument( + "Source and destination subshapes do not match: source=", + xla::ShapeUtil::HumanStringWithLayout(source_subshape), + " dest=", xla::ShapeUtil::HumanStringWithLayout(dest_subshape)); + } } *buffers_.mutable_element(dest_index) = source_buffer; source_buffer->Ref(); - dest_buffer->Unref(); + if (dest_buffer != nullptr) { + // If we handed over the ownership of a buffer in ToDeviceMemoryTree(), we + // will be called here on the way back from execution, to alias back the + // buffer at that index. In that case the buffers will be the same. So we + // need to discard the memory at the destination buffer, before releasing + // the reference. 
+ if (dest_buffer->allocation().IsSameAs(source_buffer->allocation()) && + dest_buffer != source_buffer) { + dest_buffer->DiscardAllocation(); + } + dest_buffer->Unref(); + } return Status::OK(); } -xla::ShapeTree +xla::StatusOr> XRTTupleAllocation::ToDeviceMemoryTree( const std::function& release_checker) { xla::ShapeTree shaped_tree(on_device_shape()); - for (const auto& buffer : buffers_) { - if (!release_checker(buffer.first)) { - *shaped_tree.mutable_element(buffer.first) = buffer.second->allocation(); + for (const auto& index_buffer : buffers_) { + if (index_buffer.second == nullptr || + index_buffer.second->allocation().is_null()) { + return errors::InvalidArgument("Literal buffer at index ", + index_buffer.first.ToString(), + " has been released"); + } + if (!release_checker(index_buffer.first)) { + *shaped_tree.mutable_element(index_buffer.first) = + index_buffer.second->allocation(); } else { - *shaped_tree.mutable_element(buffer.first) = se::OwningDeviceMemory( - buffer.second->allocation(), device_ordinal_, allocator_); - DiscardAllocation(buffer.first); + // We keep the ownership of the device memory here. + *shaped_tree.mutable_element(index_buffer.first) = se::OwningDeviceMemory( + index_buffer.second->allocation(), device_ordinal_, allocator_); } } - return shaped_tree; + return std::move(shaped_tree); } } // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h index 4d284382532..929c77b3f5c 100644 --- a/tensorflow/compiler/xrt/xrt_state.h +++ b/tensorflow/compiler/xrt/xrt_state.h @@ -18,6 +18,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XRT_XRT_STATE_H_ #define TENSORFLOW_COMPILER_XRT_XRT_STATE_H_ +#include #include #include #include @@ -27,17 +28,21 @@ limitations under the License. #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/compiler/xrt/xrt_refptr.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/stream_executor.h" namespace tensorflow { +// Cannot include xrt_memory_manager.h here, as it needs to include this file. +class XRTMemoryManager; + // TODO(misard) make this a Tensor if and when that makes sense. // A reference-counted wrapper around a buffer allocation. This maps an XLA // tuple index or a non-tuple XLA shape to a region of device memory. The device @@ -51,36 +56,23 @@ class XRTBufferAllocation : public core::RefCounted { // The region of device memory being wrapped. const se::DeviceMemoryBase& allocation(); - // Sets the DeviceMemoryBase to be null. DiscardAllocation should be called - // when ownership of the underlying buffer has been transferred, e.g., to an - // output buffer when input and output buffers are aliased during - // execution. The call to DiscardAllocation prevents any device buffer being - // freed when the reference count drops to zero. - void DiscardAllocation(); - - // Returns the expected size of the allocation. 
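The xrt_state.cc changes above converge on one convention for released storage: when a sub-buffer is handed off (aliased into an execution result, swapped out, or moved into a new tuple), its entry in the buffers_ shape tree is set to nullptr, and consumers such as ToShapedBuffer() and ToDeviceMemoryTree() must fail with InvalidArgument instead of handing out a partially valid view. The sketch below illustrates that convention with toy types; Buffer, TupleStorage and ToFullView are hypothetical names, not part of the patch.

```c++
// Toy illustration only: "Buffer" and "TupleStorage" stand in for
// XRTBufferAllocation and the buffers_ shape tree of the patch.
#include <cstddef>
#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <vector>

struct Buffer {
  std::size_t size = 0;  // Device memory footprint of this piece.
};

struct TupleStorage {
  // Index -> buffer; a nullptr entry means the buffer was handed off
  // (e.g. aliased into an execution result) or swapped out.
  std::map<std::string, Buffer*> buffers;

  // Mirrors the ToShapedBuffer()/ToDeviceMemoryTree() checks above: fail fast
  // if any piece has been released instead of returning a half-valid view.
  std::optional<std::vector<Buffer*>> ToFullView() const {
    std::vector<Buffer*> view;
    for (const auto& [index, buffer] : buffers) {
      if (buffer == nullptr) {
        std::cerr << "buffer at index " << index << " has been released\n";
        return std::nullopt;
      }
      view.push_back(buffer);
    }
    return view;
  }

  // Mirrors SetDeviceMemorySize(): the footprint counts only live entries.
  std::size_t DeviceMemorySize() const {
    std::size_t total = 0;
    for (const auto& [index, buffer] : buffers) {
      if (buffer != nullptr) total += buffer->size;
    }
    return total;
  }
};

int main() {
  Buffer root{128}, leaf{64};
  TupleStorage storage{{{"{}", &root}, {"{0}", &leaf}}};
  std::cout << "footprint: " << storage.DeviceMemorySize() << "\n";  // 192
  storage.buffers["{0}"] = nullptr;  // Simulate a released sub-buffer.
  std::cout << "valid view? " << (storage.ToFullView() ? "yes" : "no") << "\n";
}
```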
Since DiscardAllocation() will - // set allocation_ to {null,0}, and since later we might want to replace the - // discarded buffer with a new one, we need to be able to verify the size - // compatibility. - uint64 size() const { return size_; } + void DiscardAllocation() { allocation_ = se::DeviceMemoryBase(); } private: - uint64 size_ = 0; se::DeviceMemoryBase allocation_; int device_ordinal_; se::DeviceMemoryAllocator* allocator_; }; -// Entry in the resource manager corresponding to an allocation handle returned -// to a client. The handle identifies an immutable tuple of data in device -// memory. New handles can be created in three ways: by passing a literal in -// which case device memory is allocated and the literal is transferred to that -// memory; by aliasing a sub-shape of an existing tuple-shaped handle; or by -// aliasing a vector of existing handles to create a new tuple. The underlying -// storage is reference-counted. When a handle is released, the reference count -// of each storage buffer is decremented, and buffers with no outstanding -// references are freed. -class XRTTupleAllocation : public ResourceBase { +// A XRTTupleAllocation represents an allocated memory area on the device. +// New tuples can be created in three ways: by passing a literal in which case +// device memory is allocated and the literal is transferred to that memory; by +// aliasing a sub-shape of an existing tuple-shaped handle; or by aliasing a +// vector of existing handles to create a new tuple. The underlying storage is +// reference-counted. When a handle is released, the reference count of each +// storage buffer is decremented, and buffers with no outstanding references are +// freed. +class XRTTupleAllocation : public core::RefCounted { public: ~XRTTupleAllocation() override; @@ -88,6 +80,7 @@ class XRTTupleAllocation : public ResourceBase { // literal to that memory, and returns a XRTTupleAllocation handle to the // allocated buffers. static Status CreateAndTransfer(const xla::LiteralBase& literal, + XRTMemoryManager* memory_manager, xla::Backend* backend, int device_ordinal, XRTTupleAllocation** allocation); @@ -106,16 +99,11 @@ class XRTTupleAllocation : public ResourceBase { XRTTupleAllocation** allocation, bool alias_parent_allocation); - // Runs a compaction cycle which copies the device data to host, frees the - // device data, and then reallocate and send back the data. - static Status CompactAllocations(ResourceMgr* rm, xla::Backend* backend, - int device_ordinal); - // A structure describing a leaf of a tree of tuples to expand. Each leaf // contains an allocation and indicates whether or not the allocation's handle // should be freed after incorporating its buffers into the expanded tree. struct ExpandedTupleInput { - XRTTupleAllocation* allocation; + RefPtr allocation; bool release_allocation_after_use; }; @@ -129,52 +117,70 @@ class XRTTupleAllocation : public ResourceBase { // an input is repeated, release_input_handle must be false for every leaf // where that input appears. The latter property is not validated by MakeTuple // and must be enforced by the caller. - static Status MakeTuple(xla::Backend* backend, int device_ordinal, + static Status MakeTuple(XRTMemoryManager* memory_manager, + xla::Backend* backend, int device_ordinal, const xla::ShapeTree& elements, XRTTupleAllocation** allocation); - // Retrieves the allocation interned under key from rm. The caller owns a - // reference to allocation after looking it up. 
- static Status Lookup(ResourceMgr* rm, int64 key, - XRTTupleAllocation** allocation); - - // Deletes the reference in the rm to an allocation interned under key. - static Status DeleteFromResourceManager(ResourceMgr* rm, int64 key); - - // Releases all the device memory allocated by XRT within the resource - // manager. - static Status ReleaseAllAllocations(ResourceMgr* rm); - - // Returns the invalid key value, which will be never generated by the - // Intern() API. - static int64 InvalidKey() { return 0; } - - // Adds the allocation to a ResourceMgr and returns the key that will be used - // to retrieve it. Transfers a reference on *this to rm. - Status Intern(ResourceMgr* rm, int64* key); - // Copies the allocation from device to host and returns it in literal. - Status ToLiteral(xla::Backend* backend, int device_ordinal, - xla::MutableLiteralBase* literal); + Status ToLiteral(xla::Backend* backend, xla::MutableLiteralBase* literal); // Write a new literal value to the allocation. Status WriteLiteral(xla::Backend* backend, const xla::Literal& literal); + // Stores the content of the tuple allocation into the internal literal, and + // releases all the device buffers. The swap_pinned flag tells whether a + // pinned allocation should be swapped out. It should be false in all cases + // except during the memory compaction operation run by the XRTMemoryManager. + // Returns a boolean telling whether the allocation was swapped out. + xla::StatusOr<bool> SwapOut(xla::Backend* backend, bool swap_pinned); + + // Allocates the device memory required to store the tuple value held within + // the internal literal, and transfers the literal value into the device + // memory. Returns a boolean telling whether the allocation was swapped in. + xla::StatusOr<bool> SwapIn(XRTMemoryManager* memory_manager, + xla::Backend* backend); + + // Pins the allocation first, then swaps it in (if it is not already). After + // this API returns, the allocation is pinned and its content is in device + // memory. The caller is responsible for releasing the pin-count using the + // Unpin() API. + xla::StatusOr<bool> PinAndSwapIn(XRTMemoryManager* memory_manager, + xla::Backend* backend); + + // Checks whether the allocation is currently swapped out. + bool IsSwapped() const; + + // Increases the pin-count of this allocation. If the pin-count is greater + // than 0, the allocation cannot be swapped. Returns the pin-count value + // before the increase. + int64 Pin(); + + // Decreases the pin-count of this allocation. Returns the pin-count value + // before the decrease. + int64 Unpin(); + + // Checks whether the allocation is currently pinned. + bool IsPinned() const; + // True if none of the buffers in the allocation are aliased by any other live // handle. - bool IsExclusiveOwner(); + bool IsExclusiveOwner() const; + + // Retrieves the footprint, in terms of device memory, of this allocation. + size_t GetDeviceMemorySize() const; // The ordinal of the device holding this tuple. - int device_ordinal(); + int device_ordinal() const; // Returns the shape of the tuple as seen by the host. - const xla::Shape& on_host_shape(); + const xla::Shape& on_host_shape() const; // Returns the shape of the tuple as stored on the device. - const xla::Shape& on_device_shape(); + const xla::Shape& on_device_shape() const; // Returns the buffer pointed to by the root of the tuple.
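The pin/swap contract documented above (Pin() and Unpin() return the previous pin-count, and a pinned allocation is only swapped out when the compaction path explicitly asks for it) can be illustrated with a small self-contained sketch. PinnableAllocation and TrySwapOut are toy stand-ins, not the real XRTTupleAllocation API; the pin count is assumed to live in a std::atomic as in the member list further below.

```c++
// Minimal sketch of the Pin()/Unpin()/SwapOut() contract; toy class only.
#include <atomic>
#include <cstdint>
#include <iostream>

class PinnableAllocation {
 public:
  // Returns the pin-count value before the increase, matching the comment
  // on XRTTupleAllocation::Pin().
  int64_t Pin() { return pin_count_.fetch_add(1); }

  // Returns the pin-count value before the decrease.
  int64_t Unpin() { return pin_count_.fetch_sub(1); }

  bool IsPinned() const { return pin_count_ > 0; }

  // A swap-out request succeeds only if the allocation is unpinned, or the
  // caller explicitly asks to swap a pinned one (the compaction path).
  bool TrySwapOut(bool swap_pinned) {
    if (IsPinned() && !swap_pinned) return false;
    swapped_ = true;  // Real code would move device buffers into a literal.
    return true;
  }

  bool IsSwapped() const { return swapped_; }

 private:
  std::atomic<int64_t> pin_count_{0};
  bool swapped_ = false;
};

int main() {
  PinnableAllocation alloc;
  alloc.Pin();                                   // pin-count: 0 -> 1
  std::cout << alloc.TrySwapOut(false) << "\n";  // 0: pinned, not swapped
  alloc.Unpin();                                 // pin-count: 1 -> 0
  std::cout << alloc.TrySwapOut(false) << "\n";  // 1: swapped out
}
```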
- const se::DeviceMemoryBase& root_allocation(); + const se::DeviceMemoryBase& root_allocation() const; // Stops managing the storage for the allocation at buffer_index, e.g., // because it has been aliased to the output buffer of a computation. @@ -182,7 +188,7 @@ class XRTTupleAllocation : public ResourceBase { // Returns the tree of allocations as a ShapedBuffer. This tree may not have // the same shape as on_host_shape. - xla::ShapedBuffer ToShapedBuffer(); + xla::StatusOr<xla::ShapedBuffer> ToShapedBuffer(); // Aliases the source buffer at source_index into the current tuple allocation // dest_index. @@ -191,14 +197,22 @@ class XRTTupleAllocation : public ResourceBase { const xla::ShapeIndex& dest_index); // Returns the device memory tree of this allocation. If the release_checker - // function returns true for a given index, the ownership of the device memory - // at that index is transferred to the result. Every attempt to read the value - // at that index will fail. - xla::ShapeTree<xla::MaybeOwningDeviceMemory> ToDeviceMemoryTree( + // function returns true for a given index, owned device memory is returned + // to the caller. But the tuple allocation cannot release the ownership in + // full, as the execute operation might fail. So we rely on a call to + // AliasBufferFrom() to re-alias back the buffers. This is not great (to say + // the least), but the current aliasing logic relies on + // MaybeOwningDeviceMemory being owned, to detect the fact that the user may + // want to alias a buffer. Unfortunately to do that, it needs to release the + // ownership, which is a problem if the execute fails. + // This calls for a refactoring of the whole owning/maybe-owning interface to + // introduce a sharing concept (IOW shared_ptr model vs. unique_ptr). + // We'd need something similar to XRTTupleAllocation instead of + // ScopedShapedBuffer, which wants ownership and does not allow sharing. + xla::StatusOr<xla::ShapeTree<xla::MaybeOwningDeviceMemory>> + ToDeviceMemoryTree( const std::function<bool(const xla::ShapeIndex&)>& release_checker); - string DebugString() const override { return "XLA allocation handle"; } - private: // Creates a new handle with (tuple) shape. XRTTupleAllocation(int device_ordinal, se::DeviceMemoryAllocator* allocator, @@ -211,6 +225,21 @@ class XRTTupleAllocation : public ResourceBase { se::DeviceMemoryAllocator* allocator, int device_ordinal); + // Releases all the XRTBufferAllocation buffer references and sets the + // corresponding shape tree entry to nullptr. + void ReleaseBuffers(); + + // Stores the content of the allocation from device memory to the target host + // literal. + Status StoreToLiteral(xla::Backend* backend, + xla::MutableLiteralBase* literal); + + // Sets the total size of the buffers held within this allocation's buffers. + // This API should be called once when an XRTTupleAllocation object is + // created, as the XRTTupleAllocation shapes never change, and hence neither + // does the device memory size. + void SetDeviceMemorySize(); + // Takes a tree 'elements' where each leaf is an allocation, validates that // they are all on device_ordinal managed by allocator, and returns in // host_shape and device_shape the host/device shapes of the expanded tree, @@ -221,9 +250,13 @@ class XRTTupleAllocation : public ResourceBase { se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape, xla::Shape* device_shape); + // The lock which protects the internal operations of the tuple allocation. It + // is mutable to allow const-like operations to be declared as such. + mutable mutex lock_; + // Location of the memory that is being managed.
- int device_ordinal_; - se::DeviceMemoryAllocator* allocator_; + const int device_ordinal_; + se::DeviceMemoryAllocator* const allocator_; // The shape that the caller thinks the tuple has. const xla::Shape on_host_shape_; @@ -233,6 +266,13 @@ class XRTTupleAllocation : public ResourceBase { // The tree of reference-counted buffers, which uses on_device_shape_ as its // shape. xla::ShapeTree<XRTBufferAllocationPtr> buffers_; + // The footprint of the allocation, when residing on device memory. + size_t device_memory_size_ = 0; + // If the allocation is swapped out, this is the literal storing its content. + std::unique_ptr<xla::Literal> literal_; + // A pinned allocation is one which cannot be swapped out. If pin_count_ > 0 + // then the allocation is pinned. + std::atomic<int64> pin_count_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc index 518c993f390..baa7112710e 100644 --- a/tensorflow/compiler/xrt/xrt_util.cc +++ b/tensorflow/compiler/xrt/xrt_util.cc @@ -25,6 +25,88 @@ limitations under the License. namespace tensorflow { namespace { +// The ScopedHandles data structure is used in the ExecuteChained() API and its +// task is to track tuple allocation registrations. It is used both to track +// intermediate results of a chained computation and its final results. Anything +// which is marked to be released will be released using the XRTMemoryManager +// once the object is destroyed (unless an explicit call to Drop() or Release() +// is made). +class ScopedHandles { + public: + explicit ScopedHandles(RefPtr<XRTMemoryManager> memory_manager) + : memory_manager_(std::move(memory_manager)) {} + + ~ScopedHandles() { + for (size_t i = 0; i < handles_.size(); ++i) { + if (handles_release_[i]) { + memory_manager_->Release(handles_[i]).IgnoreError(); + } + } + } + + int64 operator[](size_t index) const { return handles_.at(index); } + + size_t size() const { return handles_.size(); } + + // Adds the given handle at the index position, marking it releasable + // according to the release argument. If a to-be-released handle already + // exists at the same index, it will be released. + Status Add(size_t index, int64 handle, bool release) { + if (index >= handles_.size()) { + handles_.resize(index + 1, XRTMemoryManager::InvalidKey()); + handles_release_.resize(index + 1, false); + } + if (handles_release_[index]) { + Status status = memory_manager_->Release(handles_[index]); + if (!status.ok()) { + if (release) { + memory_manager_->Release(handle).IgnoreError(); + } + return status; + } + } + handles_[index] = handle; + handles_release_[index] = release; + return Status::OK(); + } + + // Adds a to-be-released tuple allocation at the given index. + Status Add(size_t index, RefPtr<XRTTupleAllocation> tuple) { + return Add(index, memory_manager_->Register(std::move(tuple)), + /*release=*/true); + } + + // Drops the handle at the given index, and releases it using + // XRTMemoryManager::Release() if it is marked as to-be-released. + Status Drop(size_t index) { + if (handles_release_.at(index)) { + TF_RETURN_IF_ERROR(memory_manager_->Release(handles_[index])); + } + Release(index); + return Status::OK(); + } + + // Releases the handle at the given index. The destructor will not call the + // XRTMemoryManager::Release() API on such a handle.
+ int64 Release(size_t index) { + int64 handle = handles_.at(index); + handles_[index] = XRTMemoryManager::InvalidKey(); + handles_release_[index] = false; + return handle; + } + + // Looks up the handle stored at the given index, and returns the matching + // tuple allocation. + xla::StatusOr> Lookup(size_t index) const { + return memory_manager_->Lookup(handles_.at(index)); + } + + private: + RefPtr memory_manager_; + std::vector handles_; + std::vector handles_release_; +}; + bool DebugOptionsPassThroughEnabled() { const char* env = getenv("TF_XLA_DEBUG_OPTIONS_PASSTHROUGH"); bool enabled = @@ -61,6 +143,23 @@ Status MakeOutput(const RefPtr& output, int64 index, return Status::OK(); } +Status PopulateOpWorkingSet(xla::Backend* backend, + const xrt::XRTChainedExecuteOp& op, + int current_index, const ScopedHandles& outputs, + XRTMemoryManager::WorkingSet* working_set) { + for (int i = 0; i < op.inputs_size(); ++i) { + auto& input = op.inputs(i); + if (input.op_index() >= current_index) { + return errors::InvalidArgument( + "Input index ", input.op_index(), + " is above the current position: ", current_index); + } + TF_RETURN_IF_ERROR( + working_set->LookupAndPin(backend, outputs[input.op_index()])); + } + return Status::OK(); +} + } // namespace xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options) { @@ -81,7 +180,7 @@ xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options) { } xla::StatusOr> GetComputationInputs( - OpKernelContext* context, ResourceMgr* rm, const char* input_name) { + OpKernelContext* context, const char* input_name) { OpInputList arg_list; TF_RETURN_IF_ERROR(context->input_list(input_name, &arg_list)); // Concatenate all input uids from list of scalars-or-vectors carrying them. @@ -102,7 +201,8 @@ xla::StatusOr> GetComputationInputs( return std::move(input_coords); } -Status CreateExecuteOutput(OpKernelContext* context, ResourceMgr* rm, +Status CreateExecuteOutput(OpKernelContext* context, + XRTMemoryManager* memory_manager, RefPtr output_tuple, bool return_exploded_tuple) { if (return_exploded_tuple && output_tuple->on_host_shape().IsTuple()) { @@ -117,23 +217,21 @@ Status CreateExecuteOutput(OpKernelContext* context, ResourceMgr* rm, TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( output_tuple.get(), {i}, &suballocation, /*alias_parent_allocation=*/false)); - int64 key; - TF_RETURN_IF_ERROR(suballocation->Intern(rm, &key)); - output_tensor->vec()(i) = key; + output_tensor->vec()(i) = memory_manager->Register(suballocation); } } else { Tensor* output_tensor; TF_RETURN_IF_ERROR( context->allocate_output(0, TensorShape({}), &output_tensor)); - int64 key; - TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key)); - output_tuple.release(); - output_tensor->scalar()() = key; + output_tensor->scalar()() = + memory_manager->Register(std::move(output_tuple)); } return Status::OK(); } -Status ExecuteChained(OpKernelContext* context, ResourceMgr* rm, +Status ExecuteChained(OpKernelContext* context, + const RefPtr& memory_manager, + xla::Backend* backend, int device_ordinal, const xrt::XRTChainedExecutePlan& plan, const xrt::XRTChainedExecuteConfig& config, const ChainedExecuteFn& execute_op) { @@ -145,41 +243,43 @@ Status ExecuteChained(OpKernelContext* context, ResourceMgr* rm, uses[input.op_index()] += 1; } } - std::vector> ops_outputs(plan.ops_size()); - std::vector> results; + + ScopedHandles outputs(memory_manager); + ScopedHandles results(memory_manager); for (int i = 0; i < plan.ops_size(); ++i) { auto& op = plan.ops(i); if 
(op.op_oneof_case() == xrt::XRTChainedExecuteOp::kDataHandle) { - // This operation is a device data load. Fetch the proper - // XRTTupleAllocation behind the user handle and fill up the op output at - // the current position. - XRTTupleAllocation* tuple; - TF_RETURN_IF_ERROR( - XRTTupleAllocation::Lookup(rm, op.data_handle(), &tuple)); - ops_outputs[i].reset(tuple); + // This operation is a device data load. Set the handle as output and + // leave the release flag off, since this is not an intermediate output. + TF_RETURN_IF_ERROR(outputs.Add(i, op.data_handle(), /*release=*/false)); } else if (op.op_oneof_case() == xrt::XRTChainedExecuteOp::kComputationHandle) { // This is an XRT execute operation, forward to the device specific - // handler. - TF_ASSIGN_OR_RETURN(ops_outputs[i], execute_op(op, i, ops_outputs)); + // handler. Populating the working set makes sure the input allocations + // for this execute operations are pinned to device memory. + XRTMemoryManager::WorkingSet working_set(memory_manager); + TF_RETURN_IF_ERROR( + PopulateOpWorkingSet(backend, op, i, outputs, &working_set)); + TF_ASSIGN_OR_RETURN(auto tuple, + execute_op(op, working_set.PinnedTuples())); + TF_RETURN_IF_ERROR(outputs.Add(i, std::move(tuple))); } else { return errors::InvalidArgument( "Undefined operation kind at post-order position ", i); } // If the result of this chained operation is an output result, feed the - // results vector at the desired position. + // results at the desired position. for (auto& output : op.outputs()) { - if (output.result_index() >= results.size()) { - results.resize(output.result_index() + 1); - } - TF_RETURN_IF_ERROR(MakeOutput(ops_outputs[i], output.output_index(), - &results[output.result_index()])); + TF_ASSIGN_OR_RETURN(auto tuple, outputs.Lookup(i)); + RefPtr result; + TF_RETURN_IF_ERROR(MakeOutput(tuple, output.output_index(), &result)); + TF_RETURN_IF_ERROR(results.Add(output.result_index(), std::move(result))); } // Drop intermediate results which have no more users. for (auto& input : op.inputs()) { uses[input.op_index()] -= 1; if (uses[input.op_index()] == 0) { - ops_outputs[input.op_index()].reset(); + TF_RETURN_IF_ERROR(outputs.Drop(input.op_index())); } } } @@ -188,12 +288,7 @@ Status ExecuteChained(OpKernelContext* context, ResourceMgr* rm, TF_RETURN_IF_ERROR(context->allocate_output( 0, TensorShape({static_cast(results.size())}), &output_tensor)); for (size_t i = 0; i < results.size(); ++i) { - int64 key = XRTTupleAllocation::InvalidKey(); - if (results[i] != nullptr) { - TF_RETURN_IF_ERROR(results[i]->Intern(rm, &key)); - results[i].release(); - } - output_tensor->vec()(i) = key; + output_tensor->vec()(i) = results.Release(i); } return Status::OK(); } diff --git a/tensorflow/compiler/xrt/xrt_util.h b/tensorflow/compiler/xrt/xrt_util.h index 07159dd5677..32244a63081 100644 --- a/tensorflow/compiler/xrt/xrt_util.h +++ b/tensorflow/compiler/xrt/xrt_util.h @@ -18,97 +18,19 @@ limitations under the License. 
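Before the xrt_util.h interface changes below, the handle lifetime logic that the ExecuteChained() rewrite above implements is worth spelling out: each op output is registered through ScopedHandles, use counts are computed in a first pass, and intermediate outputs are dropped as soon as their last consumer has executed. The following self-contained sketch models only that bookkeeping; Op and RunChain are hypothetical toy types (the real code works on XRTChainedExecutePlan and RefPtr handles), and the drop rule is simplified.

```c++
// A stripped-down model of the handle lifetime logic in ExecuteChained():
// every op output is tracked, and an intermediate output is dropped as soon
// as its last consumer has run.
#include <cstdio>
#include <functional>
#include <vector>

struct Op {
  std::vector<int> inputs;   // Indices of earlier ops whose outputs we consume.
  bool is_result = false;    // Whether this op's output is a final result.
};

void RunChain(const std::vector<Op>& ops,
              const std::function<int(const Op&)>& execute_op) {
  // First pass: count how many later ops consume each output (the uses[]
  // vector of ExecuteChained()).
  std::vector<int> uses(ops.size(), 0);
  for (const Op& op : ops) {
    for (int input : op.inputs) ++uses[input];
  }

  std::vector<int> outputs(ops.size(), -1);  // -1 == released / never produced.
  for (size_t i = 0; i < ops.size(); ++i) {
    outputs[i] = execute_op(ops[i]);
    // Drop intermediate outputs that no longer have pending consumers, so the
    // device memory they model can be reclaimed before the chain finishes.
    for (int input : ops[i].inputs) {
      if (--uses[input] == 0 && !ops[input].is_result) {
        std::printf("dropping intermediate output of op %d\n", input);
        outputs[input] = -1;
      }
    }
  }
}

int main() {
  // op0 and op1 feed op2; only op2 is a final result.
  std::vector<Op> ops = {{{}, false}, {{}, false}, {{0, 1}, true}};
  RunChain(ops, [](const Op&) { static int next = 0; return next++; });
}
```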
#ifndef TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ #define TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_ +#include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xrt/xrt.pb.h" +#include "tensorflow/compiler/xrt/xrt_memory_manager.h" +#include "tensorflow/compiler/xrt/xrt_refptr.h" #include "tensorflow/compiler/xrt/xrt_state.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -// Reference counted smart pointer for XRT objects providing the standard -// Ref()/Unref() APIs. -template -class RefPtr { - public: - RefPtr() = default; - // Creates a RefPtr from a pointer. This is an ownership transfer operation, - // and the caller has to own a valid reference to ptr (unless ptr is nullptr). - RefPtr(T* ptr) : ptr_(ptr) {} - RefPtr(const RefPtr& other) : ptr_(other.ptr_) { Acquire(ptr_); } - RefPtr(RefPtr&& other) : ptr_(other.ptr_) { other.ptr_ = nullptr; } - - ~RefPtr() { Release(ptr_); } - - RefPtr& operator=(const RefPtr& other) { - if (this != &other) { - Acquire(other.ptr_); - Release(ptr_); - ptr_ = other.ptr_; - } - return *this; - } - - RefPtr& operator=(RefPtr&& other) { - if (this != &other) { - Release(ptr_); - ptr_ = other.ptr_; - other.ptr_ = nullptr; - } - return *this; - } - - operator bool() const { return ptr_ != nullptr; } - bool operator==(const RefPtr& rhs) const { return ptr_ == rhs.ptr_; } - bool operator!=(const RefPtr& rhs) const { return ptr_ != rhs.ptr_; } - bool operator==(const T* ptr) const { return ptr_ == ptr; } - bool operator!=(const T* ptr) const { return ptr_ != ptr; } - bool operator==(std::nullptr_t ptr) const { return ptr_ == ptr; } - bool operator!=(std::nullptr_t ptr) const { return ptr_ != ptr; } - - T* get() const { return ptr_; } - - T* operator->() const { - CHECK(ptr_ != nullptr); // Crash OK - return ptr_; - } - - T& operator*() const { - CHECK(ptr_ != nullptr); // Crash OK - return *ptr_; - } - - T* release() { - T* ptr = ptr_; - ptr_ = nullptr; - return ptr; - } - - // Resets the RefPtr from a pointer. This is an ownership transfer operation, - // and the caller has to own a valid reference to ptr (unless ptr is nullptr). - void reset(T* ptr = nullptr) { - Release(ptr_); - ptr_ = ptr; - } - - private: - static void Release(T* ptr) { - if (ptr != nullptr) { - ptr->Unref(); - } - } - - static void Acquire(T* ptr) { - if (ptr != nullptr) { - ptr->Ref(); - } - } - - T* ptr_ = nullptr; -}; - struct InputCoords { explicit InputCoords(int64 handle) : handle(handle) {} InputCoords(int64 handle, xla::ShapeIndex index) @@ -128,12 +50,13 @@ xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options); // Populates the input_coords with a list of input coordinates from a input_name // op argument. xla::StatusOr> GetComputationInputs( - OpKernelContext* context, ResourceMgr* rm, const char* input_name); + OpKernelContext* context, const char* input_name); // Create the XRT execute output tensor given the computation result // (output_tuple). The return_exploded_tuple tells whether a tuple result should // be returned as vector of handles representing each tuple child. 
-Status CreateExecuteOutput(OpKernelContext* context, ResourceMgr* rm, +Status CreateExecuteOutput(OpKernelContext* context, + XRTMemoryManager* memory_manager, RefPtr output_tuple, bool return_exploded_tuple); @@ -141,9 +64,11 @@ Status CreateExecuteOutput(OpKernelContext* context, ResourceMgr* rm, // function. using ChainedExecuteFn = std::function>( - const xrt::XRTChainedExecuteOp&, int, + const xrt::XRTChainedExecuteOp&, absl::Span>)>; -Status ExecuteChained(OpKernelContext* context, ResourceMgr* rm, +Status ExecuteChained(OpKernelContext* context, + const RefPtr& memory_manager, + xla::Backend* backend, int device_ordinal, const xrt::XRTChainedExecutePlan& plan, const xrt::XRTChainedExecuteConfig& config, const ChainedExecuteFn& execute_op); diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 6760ef265d3..c9ee6f9ac83 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -1,9 +1,10 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:__subpackages__"]) +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) load("//third_party/mpi:mpi.bzl", "if_mpi") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD index f6c6560c1c3..2ebb821f0b3 100644 --- a/tensorflow/contrib/all_reduce/BUILD +++ b/tensorflow/contrib/all_reduce/BUILD @@ -3,9 +3,10 @@ # APIs are subject to change. Eventually to be replaced by equivalent # functionality within TensorFlow core. -package(default_visibility = ["//tensorflow:__subpackages__"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD index 5608e7ddafa..3806237cf9e 100644 --- a/tensorflow/contrib/android/BUILD +++ b/tensorflow/contrib/android/BUILD @@ -3,16 +3,17 @@ load("@build_bazel_rules_android//android:rules.bzl", "android_library") -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) load( "//tensorflow:tensorflow.bzl", - "tf_copts", "if_android", + "tf_copts", ) exports_files([ diff --git a/tensorflow/contrib/android/asset_manager_filesystem.cc b/tensorflow/contrib/android/asset_manager_filesystem.cc index d14b2126a0f..a5aa950bff6 100644 --- a/tensorflow/contrib/android/asset_manager_filesystem.cc +++ b/tensorflow/contrib/android/asset_manager_filesystem.cc @@ -27,7 +27,7 @@ namespace { string RemoveSuffix(const string& name, const string& suffix) { string output(name); StringPiece piece(output); - str_util::ConsumeSuffix(&piece, suffix); + absl::ConsumeSuffix(&piece, suffix); return string(piece); } @@ -230,7 +230,7 @@ string AssetManagerFileSystem::NormalizeDirectoryPath(const string& fname) { string AssetManagerFileSystem::RemoveAssetPrefix(const string& name) { StringPiece piece(name); - str_util::ConsumePrefix(&piece, prefix_); + absl::ConsumePrefix(&piece, prefix_); return string(piece); } diff --git a/tensorflow/contrib/autograph/BUILD b/tensorflow/contrib/autograph/BUILD index e37ad7a7581..da83008c422 100644 --- a/tensorflow/contrib/autograph/BUILD +++ 
b/tensorflow/contrib/autograph/BUILD @@ -1,4 +1,6 @@ -licenses(["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "py_test") diff --git a/tensorflow/contrib/autograph/examples/benchmarks/BUILD b/tensorflow/contrib/autograph/examples/benchmarks/BUILD index 651b108e239..0cc42f01fbd 100644 --- a/tensorflow/contrib/autograph/examples/benchmarks/BUILD +++ b/tensorflow/contrib/autograph/examples/benchmarks/BUILD @@ -1,4 +1,6 @@ -licenses(["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "py_test") load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark") diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD index bc9b2b05172..3b8e3059501 100644 --- a/tensorflow/contrib/batching/BUILD +++ b/tensorflow/contrib/batching/BUILD @@ -2,10 +2,9 @@ package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - load( "//tensorflow:tensorflow.bzl", "py_test", @@ -34,6 +33,7 @@ py_test( name = "batch_ops_test", size = "small", srcs = ["python/ops/batch_ops_test.py"], + python_version = "PY2", shard_count = 5, srcs_version = "PY2AND3", tags = [ diff --git a/tensorflow/contrib/bigtable/BUILD b/tensorflow/contrib/bigtable/BUILD index 71538e0770d..bc4c145668f 100644 --- a/tensorflow/contrib/bigtable/BUILD +++ b/tensorflow/contrib/bigtable/BUILD @@ -2,19 +2,18 @@ package( default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") load( "//tensorflow:tensorflow.bzl", + "tf_cc_test", "tf_copts", "tf_custom_op_library", "tf_gen_op_libs", "tf_gen_op_wrapper_py", "tf_kernel_library", - "tf_cc_test", "tf_py_test", ) diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc index 0bdaf3ae0bd..01cedd8d762 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc @@ -73,7 +73,7 @@ string RegexFromStringSet(const std::vector& strs) { if (uniq.size() == 1) { return *uniq.begin(); } - return str_util::Join(uniq, "|"); + return absl::StrJoin(uniq, "|"); } } // namespace tensorflow diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD index 6791e379107..95c08f67e54 100644 --- a/tensorflow/contrib/boosted_trees/BUILD +++ b/tensorflow/contrib/boosted_trees/BUILD @@ -1,13 +1,14 @@ # TensorFlow code for training gradient boosted trees. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = [ - "//visibility:public", -]) - load("//tensorflow:tensorflow.bzl", "py_test") load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD index 968aff18053..8a2beede37d 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD +++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD @@ -1,14 +1,13 @@ # This directory contains estimators to train and run inference on # gradient boosted trees on top of TensorFlow. 
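Several of the surrounding file diffs (asset_manager_filesystem.cc, bigtable_lib.cc, bigquery_table_accessor_test.cc) replace the old tensorflow::str_util helpers with their Abseil equivalents. The standalone demo below shows the three absl calls involved; it assumes the program is built against Abseil and is not part of the patch itself.

```c++
// Demo of the absl string helpers that replace the old str_util calls in the
// surrounding diffs: ConsumeSuffix, StrJoin, StrContains.
#include <iostream>
#include <set>
#include <string>

#include "absl/strings/match.h"
#include "absl/strings/str_join.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"

int main() {
  // absl::ConsumeSuffix() trims in place and reports whether it matched,
  // mirroring the RemoveSuffix() helper in asset_manager_filesystem.cc.
  absl::string_view name = "model.tflite";
  bool stripped = absl::ConsumeSuffix(&name, ".tflite");
  std::cout << stripped << " " << name << "\n";  // 1 model

  // absl::StrJoin() replaces str_util::Join() in RegexFromStringSet().
  std::set<std::string> columns = {"a", "b", "c"};
  std::cout << absl::StrJoin(columns, "|") << "\n";  // a|b|c

  // absl::StrContains() replaces str_util::StrContains() in the test helper.
  std::cout << absl::StrContains("hello world", "world") << "\n";  // 1
}
```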
-licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) +exports_files(["LICENSE"]) + load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD index 634dfab1090..56c55a4055d 100644 --- a/tensorflow/contrib/boosted_trees/lib/BUILD +++ b/tensorflow/contrib/boosted_trees/lib/BUILD @@ -1,17 +1,16 @@ # Description: # This directory contains common utilities used in boosted_trees. -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - package( default_visibility = [ "//tensorflow/contrib/boosted_trees:__subpackages__", "//tensorflow/contrib/boosted_trees:friends", ], + licenses = ["notice"], # Apache 2.0 ) +exports_files(["LICENSE"]) + load("//tensorflow:tensorflow.bzl", "py_test") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") diff --git a/tensorflow/contrib/boosted_trees/proto/BUILD b/tensorflow/contrib/boosted_trees/proto/BUILD index b07f0a43142..ed84c7a02d7 100644 --- a/tensorflow/contrib/boosted_trees/proto/BUILD +++ b/tensorflow/contrib/boosted_trees/proto/BUILD @@ -1,4 +1,6 @@ -licenses(["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/boosted_trees/resources/BUILD b/tensorflow/contrib/boosted_trees/resources/BUILD index c0651868453..1205ce55694 100644 --- a/tensorflow/contrib/boosted_trees/resources/BUILD +++ b/tensorflow/contrib/boosted_trees/resources/BUILD @@ -1,14 +1,13 @@ -licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - package( default_visibility = [ "//tensorflow/contrib/boosted_trees:__subpackages__", "//tensorflow/contrib/boosted_trees:friends", ], + licenses = ["notice"], # Apache 2.0 ) +exports_files(["LICENSE"]) + cc_library( name = "stamped_resource", hdrs = ["stamped_resource.h"], diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD index caedf5b2d1d..aa5c47c6350 100644 --- a/tensorflow/contrib/checkpoint/python/BUILD +++ b/tensorflow/contrib/checkpoint/python/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "tf_py_test") diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD index 523a9efcf05..3a6b6232fb6 100644 --- a/tensorflow/contrib/cloud/BUILD +++ b/tensorflow/contrib/cloud/BUILD @@ -3,10 +3,9 @@ package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - load( "//tensorflow:tensorflow.bzl", "tf_gen_op_libs", diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD index 20f8c2b2453..13a03c81061 100644 --- a/tensorflow/contrib/cloud/kernels/BUILD +++ b/tensorflow/contrib/cloud/kernels/BUILD @@ -3,10 +3,9 @@ package( default_visibility = ["//visibility:private"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - load( "//tensorflow:tensorflow.bzl", "tf_cc_test", diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc index 7416eb19d33..cb02cb88a84 100644 --- 
a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc +++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc @@ -30,7 +30,7 @@ constexpr char kTestDataset[] = "test-dataset"; constexpr char kTestTable[] = "test-table"; bool HasSubstr(StringPiece base, StringPiece substr) { - bool ok = str_util::StrContains(base, substr); + bool ok = absl::StrContains(base, substr); EXPECT_TRUE(ok) << base << ", expected substring " << substr; return ok; } diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD index f944b7f8843..a552173fb55 100644 --- a/tensorflow/contrib/cluster_resolver/BUILD +++ b/tensorflow/contrib/cluster_resolver/BUILD @@ -6,10 +6,9 @@ package( default_visibility = [ "//tensorflow:__subpackages__", ], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - py_library( name = "cluster_resolver_pip", srcs_version = "PY2AND3", diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake index 174f7d1d47f..c102b327dce 100644 --- a/tensorflow/contrib/cmake/external/png.cmake +++ b/tensorflow/contrib/cmake/external/png.cmake @@ -16,8 +16,8 @@ include (ExternalProject) include (GNUInstallDirs) set(png_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/png_archive) -set(png_URL https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.35.tar.gz) -set(png_HASH SHA256=6d59d6a154ccbb772ec11772cb8f8beb0d382b61e7ccc62435bf7311c9f4b210) +set(png_URL https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.37.tar.gz) +set(png_HASH SHA256=ca74a0dace179a8422187671aee97dd3892b53e168627145271cad5b5ac81307) set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png) set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install) diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake index cc263d7995c..24e45236a63 100644 --- a/tensorflow/contrib/cmake/tf_core_framework.cmake +++ b/tensorflow/contrib/cmake/tf_core_framework.cmake @@ -274,10 +274,9 @@ if (NOT WIN32) COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir} --git_tag_override=${GIT_TAG_OVERRIDE} DEPENDS __force_rebuild) + set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc) endif() -set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc) - ######################################################## # tf_core_framework library ######################################################## diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD index 773560fcd0b..2c7c56b361f 100644 --- a/tensorflow/contrib/compiler/BUILD +++ b/tensorflow/contrib/compiler/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = [":friends"]) +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) package_group( name = "friends", diff --git a/tensorflow/contrib/constrained_optimization/BUILD b/tensorflow/contrib/constrained_optimization/BUILD index bd81e36c423..ac5243d525d 100644 --- a/tensorflow/contrib/constrained_optimization/BUILD +++ b/tensorflow/contrib/constrained_optimization/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git 
a/tensorflow/contrib/copy_graph/BUILD b/tensorflow/contrib/copy_graph/BUILD index 6273bcf7a5c..55c75a30e14 100644 --- a/tensorflow/contrib/copy_graph/BUILD +++ b/tensorflow/contrib/copy_graph/BUILD @@ -1,12 +1,13 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/crf/BUILD b/tensorflow/contrib/crf/BUILD index 5c1a17df4f9..c57680f6e4f 100644 --- a/tensorflow/contrib/crf/BUILD +++ b/tensorflow/contrib/crf/BUILD @@ -2,12 +2,13 @@ # Contains classes to construct a CRF layer # APIs here are meant to evolve over time. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "cuda_py_tests") py_library( diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD index 174d82c1b9a..63f04de3317 100644 --- a/tensorflow/contrib/cudnn_rnn/BUILD +++ b/tensorflow/contrib/cudnn_rnn/BUILD @@ -4,10 +4,9 @@ package( default_visibility = ["//visibility:private"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "cuda_py_test") diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD index 38f1c65a4d5..74e3ae067d6 100644 --- a/tensorflow/contrib/data/BUILD +++ b/tensorflow/contrib/data/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//tensorflow:internal"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index 10475cf2866..354683505eb 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//tensorflow:internal"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py index 78019fcc7d8..8132f81c1cd 100644 --- a/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py @@ -24,13 +24,11 @@ from tensorflow.contrib.data.python.ops import get_single_element from tensorflow.contrib.data.python.ops import grouping from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -@test_util.run_v1_only("deprecated API, no eager or V2 test coverage") +@test_util.run_all_in_graph_and_eager_modes class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): 
@parameterized.named_parameters( @@ -51,13 +49,10 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): sum_reducer = grouping.Reducer(init_fn, reduce_fn, finalize_fn) - stop_t = array_ops.placeholder(dtypes.int64, shape=[]) - dataset = dataset_ops.Dataset.range(stop_t) + dataset = dataset_ops.Dataset.range(stop) element = get_single_element.reduce_dataset(dataset, sum_reducer) - with self.cached_session() as sess: - value = sess.run(element, feed_dict={stop_t: stop}) - self.assertEqual(stop * (stop - 1) / 2, value) + self.assertEqual(stop * (stop - 1) / 2, self.evaluate(element)) if __name__ == "__main__": diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD index 0fb406f1167..a4176d522dc 100644 --- a/tensorflow/contrib/data/python/ops/BUILD +++ b/tensorflow/contrib/data/python/ops/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//tensorflow:internal"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index 6a88cc68162..0bff4fb7bcd 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -219,7 +219,7 @@ def assert_element_shape(expected_shapes): output_shapes = _merge_output_shapes( dataset_ops.get_legacy_output_shapes(dataset), expected_shapes) # pylint: disable=protected-access - return batching._RestructuredDataset( + return dataset_ops._RestructuredDataset( dataset.map(_check_shape), dataset_ops.get_legacy_output_types(dataset), output_shapes=output_shapes, diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD index 06940a90d5c..0f58675af60 100644 --- a/tensorflow/contrib/decision_trees/proto/BUILD +++ b/tensorflow/contrib/decision_trees/proto/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files([ "LICENSE", diff --git a/tensorflow/contrib/deprecated/BUILD b/tensorflow/contrib/deprecated/BUILD index 035d8cfc37e..df747ea2c70 100644 --- a/tensorflow/contrib/deprecated/BUILD +++ b/tensorflow/contrib/deprecated/BUILD @@ -1,12 +1,13 @@ # Description: # Contains deprecated functions that we aren't quite ready to remove entirely -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD index 1fa4a9bcee1..112cb4ac54e 100644 --- a/tensorflow/contrib/distribute/BUILD +++ b/tensorflow/contrib/distribute/BUILD @@ -2,10 +2,9 @@ package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) filegroup( diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md index ea48cb390b9..680907252db 100644 --- a/tensorflow/contrib/distribute/README.md +++ b/tensorflow/contrib/distribute/README.md @@ -1,368 +1,5 @@ # Distribution Strategy -> *NOTE*: This is an experimental feature. 
The API and performance -> characteristics are subject to change. - -## Overview - -[`DistributionStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/DistributionStrategy) -API is an easy way to distribute your training -across multiple devices/machines. Our goal is to allow users to use existing -models and training code with minimal changes to enable distributed training. -Moreover, we've designed the API in such a way that it works with both eager and -graph execution. - -Currently we support several types of strategies: - -* [`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy): -This does in-graph replication with synchronous training -on many GPUs on one machine. Essentially, we create copies of all variables in -the model's layers on each device. We then use all-reduce to combine gradients -across the devices before applying them to the variables to keep them in sync. -* [`CollectiveAllReduceStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/CollectiveAllReduceStrategy): -This is a version of `MirroredStrategy` for multi-worker training. It uses -a collective op to do all-reduce. This supports between-graph communication and -synchronization, and delegates the specifics of the all-reduce implementation to -the runtime (as opposed to encoding it in the graph). This allows it to perform -optimizations like batching and switch between plugins that support different -hardware or algorithms. In the future, this strategy will implement -fault-tolerance to allow training to continue when there is worker failure. - -* [`ParameterServerStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/ParameterServerStrategy): -This strategy supports using parameter servers either for multi-GPU local -training or asynchronous multi-machine training. When used to train locally, -variables are not mirrored, instead they are placed on the CPU and operations -are replicated across all local GPUs. In a multi-machine setting, some are -designated as workers and some as parameter servers. Each variable is placed on -one parameter server. Computation operations are replicated across all GPUs of -the workers. - -## Multi-GPU Training - -## Example with Keras API - -Let's see how to scale to multiple GPUs on one machine using `MirroredStrategy` with [tf.keras] (https://www.tensorflow.org/guide/keras). - -Let's define a simple input dataset for training this model. Note that currently we require using -[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) -with `DistributionStrategy`. - -```python -import tensorflow as tf -from tensorflow import keras - -features = tf.data.Dataset.from_tensors([1.]).repeat(10000).batch(10) -labels = tf.data.Dataset.from_tensors([1.]).repeat(10000).batch(10) -train_dataset = tf.data.Dataset.zip((features, labels)) -``` - -To distribute this Keras model on multiple GPUs using `MirroredStrategy` we -first instantiate a `MirroredStrategy` object. - -```python -distribution = tf.contrib.distribute.MirroredStrategy() -``` - -Take a very simple model consisting of a single layer. We need to create and compile -the model under the distribution strategy scope. 
- -```python -with distribution.scope(): - inputs = tf.keras.layers.Input(shape=(1,)) - predictions = tf.keras.layers.Dense(1)(inputs) - model = tf.keras.models.Model(inputs=inputs, outputs=predictions) - - model.compile(loss='mean_squared_error', - optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.2)) -``` - -To train the model we call Keras `fit` API using the input dataset that we -created earlier, same as how we would in a non-distributed case. - -```python -model.fit(train_dataset, epochs=5, steps_per_epoch=10) -``` - -Similarly, we can also call `evaluate` and `predict` as before using appropriate -datasets. - -```python -model.evaluate(eval_dataset, steps=1) -model.predict(predict_dataset, steps=1) -``` - -That's all you need to train your model with Keras on multiple GPUs with -`MirroredStrategy`. It will take care of splitting up -the input dataset, replicating layers and variables on each device, and -combining and applying gradients. - -The model and input code does not have to change because we have changed the -underlying components of TensorFlow (such as -optimizer, batch norm and summaries) to become distribution-aware. -That means those components know how to -combine their state across devices. Further, saving and checkpointing works -seamlessly, so you can save with one or no distribution strategy and resume with -another. - - -## Example with Estimator API - -You can also use Distribution Strategy API with [`Estimator`](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator). Let's see a simple example of it's usage with `MirroredStrategy`. - - -Consider a very simple model function which tries to learn a simple function. - -```python -def model_fn(features, labels, mode): - layer = tf.layers.Dense(1) - logits = layer(features) - - if mode == tf.estimator.ModeKeys.PREDICT: - predictions = {"logits": logits} - return tf.estimator.EstimatorSpec(mode, predictions=predictions) - - loss = tf.losses.mean_squared_error( - labels=labels, predictions=tf.reshape(logits, [])) - - if mode == tf.estimator.ModeKeys.EVAL: - return tf.estimator.EstimatorSpec(mode, loss=loss) - - if mode == tf.estimator.ModeKeys.TRAIN: - train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss) - return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op) -``` - -Again, let's define a simple input function to feed data for training this model. - - -```python -def input_fn(): - features = tf.data.Dataset.from_tensors([[1.]]).repeat(100) - labels = tf.data.Dataset.from_tensors(1.).repeat(100) - return tf.data.Dataset.zip((features, labels)) -``` - -Now that we have a model function and input function defined, we can define the -estimator. To use `MirroredStrategy`, all we need to do is: - -* Create an instance of the `MirroredStrategy` class. -* Pass it to the -[`RunConfig`](https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig) -parameter of `Estimator`. - - -```python -distribution = tf.contrib.distribute.MirroredStrategy() -config = tf.estimator.RunConfig(train_distribute=distribution) -classifier = tf.estimator.Estimator(model_fn=model_fn, config=config) -classifier.train(input_fn=input_fn) -classifier.evaluate(input_fn=input_fn) -``` - -That's it! This change will now configure estimator to run on all GPUs on your -machine. - - -## Customization and Performance Tips - -Above, we showed the easiest way to use [`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy#__init__). 
-There are a few things you can customize in practice:
-
-* You can specify a list of specific GPUs (using the param `devices`) or the
-number of GPUs (using the param `num_gpus`), in case you don't want auto
-detection.
-* You can specify various parameters for all-reduce with the `cross_tower_ops`
-param, such as the all-reduce algorithm to use and gradient repacking.
-
-We've tried to make it such that you get the best performance for your existing
-model. We also recommend you follow the tips from the
-[Input Pipeline Performance Guide](https://www.tensorflow.org/performance/datasets_performance).
-Specifically, we found using [`map_and_batch`](https://www.tensorflow.org/performance/datasets_performance#map_and_batch)
-and [`dataset.prefetch`](https://www.tensorflow.org/performance/datasets_performance#pipelining)
-in the input function gives a solid boost in performance. When using
-`dataset.prefetch`, use `buffer_size=None` to let it detect the optimal buffer
-size.
-
-## Multi-worker Training
-### Overview
-
-For multi-worker training, no change is required to the `Estimator` model code.
-You can run the same model code for all tasks in your cluster, including
-parameter servers and the evaluator. But you need to use
-`tf.estimator.train_and_evaluate`, explicitly specify `num_gpus_per_worker`
-for your strategy object, and set the "TF\_CONFIG" environment variable for each
-binary running in your cluster. We'll provide a Kubernetes template in the
-[tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) repo which sets
-"TF\_CONFIG" for your training tasks.
-
-### TF\_CONFIG environment variable
-
-The "TF\_CONFIG" environment variable is a JSON string that specifies which
-tasks constitute the cluster, their addresses, and each task's role in the
-cluster. One example of "TF\_CONFIG" is:
-
-```python
-TF_CONFIG='{
-    "cluster": {
-        "worker": ["host1:port", "host2:port", "host3:port"],
-        "ps": ["host4:port", "host5:port"]
-    },
-    "task": {"type": "worker", "index": 1}
-}'
-```
-
-This "TF\_CONFIG" specifies that there are three workers and two ps tasks in the
-cluster, along with their hosts and ports. The "task" part specifies the role of
-the current task in the cluster: worker 1. Valid roles in a cluster are "chief",
-"worker", "ps", and "evaluator". There should be no "ps" job for
-`CollectiveAllReduceStrategy` and `MirroredStrategy`. The "evaluator" job is
-optional and can have at most one task. It does single-machine evaluation; if
-you don't want to do evaluation, you can pass a dummy `input_fn` to the
-`tf.estimator.EvalSpec` of `tf.estimator.train_and_evaluate`.
-
-### Dataset
-
-The `input_fn` you provide to the estimator is run once per worker, so remember
-to scale up your batch size if you have multiple GPUs on each worker.
-
-The same `input_fn` will be used for all workers if you use
-`CollectiveAllReduceStrategy` or `ParameterServerStrategy`. Therefore, it is
-important to shuffle your dataset in your `input_fn`.
-
-`MirroredStrategy` will insert a `tf.data.Dataset.shard` call into your
-`input_fn` if `auto_shard_dataset` is set to `True`. As a result, each worker
-gets a fraction of your input data.
-
-### Performance Tips
-
-We have been actively working on multi-worker performance. Currently, prefer
-`CollectiveAllReduceStrategy` for synchronous multi-worker training.
-
-### Example
-
-Let's use the same example for multi-worker training. We'll start a cluster with
-3 workers doing synchronous all-reduce training.
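-
-As noted in the Dataset section above, every worker runs the same `input_fn`,
-so it should shuffle the data and use a batch size scaled by the number of GPUs
-on the worker. A minimal sketch of such an `input_fn` (the random data, batch
-size, and GPU count below are illustrative assumptions, not part of the example
-that follows):
-
-```python
-import numpy as np
-import tensorflow as tf
-
-GPUS_PER_WORKER = 2      # illustrative; keep consistent with num_gpus_per_worker
-PER_GPU_BATCH_SIZE = 32  # illustrative
-
-# Toy in-memory data, just to keep the sketch self-contained.
-features_array = np.random.random((1000, 1)).astype(np.float32)
-labels_array = np.random.random((1000, 1)).astype(np.float32)
-
-def input_fn():
-  dataset = tf.data.Dataset.from_tensor_slices((features_array, labels_array))
-  # Shuffle on every worker; each worker sees the full (unsharded) dataset here.
-  dataset = dataset.shuffle(buffer_size=1000).repeat()
-  # Scale the batch so each of this worker's GPUs gets PER_GPU_BATCH_SIZE examples.
-  return dataset.batch(PER_GPU_BATCH_SIZE * GPUS_PER_WORKER)
-```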
-In the following code snippet, we start multi-worker training using
-`tf.estimator.train_and_evaluate`:
-
-```python
-def model_main():
-  distribution = tf.contrib.distribute.CollectiveAllReduceStrategy(
-      num_gpus_per_worker=2)
-  config = tf.estimator.RunConfig(train_distribute=distribution)
-  estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
-  train_spec = tf.estimator.TrainSpec(input_fn=input_fn)
-  eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
-  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
-```
-
-**Note**: You don't have to set "TF\_CONFIG" manually if you use our provided
-Kubernetes template.
-
-You'll then need 3 machines; find out their host addresses and one available
-port on each machine. Then set "TF\_CONFIG" in each binary and run the above
-model code.
-
-On worker 0, run:
-
-```python
-os.environ["TF_CONFIG"] = json.dumps({
-    "cluster": {
-        "worker": ["host1:port", "host2:port", "host3:port"]
-    },
-    "task": {"type": "worker", "index": 0}
-})
-
-# Call the model_main function defined above.
-model_main()
-```
-
-On worker 1, run:
-
-```python
-os.environ["TF_CONFIG"] = json.dumps({
-    "cluster": {
-        "worker": ["host1:port", "host2:port", "host3:port"]
-    },
-    "task": {"type": "worker", "index": 1}
-})
-
-# Call the model_main function defined above.
-model_main()
-```
-
-On worker 2, run:
-
-```python
-os.environ["TF_CONFIG"] = json.dumps({
-    "cluster": {
-        "worker": ["host1:port", "host2:port", "host3:port"]
-    },
-    "task": {"type": "worker", "index": 2}
-})
-
-# Call the model_main function defined above.
-model_main()
-```
-
-Then you'll find your cluster has started training! You can inspect the logs of
-the workers or start TensorBoard.
-
-### Standalone client mode
-
-We have a new way to run distributed training: you can bring up standard
-TensorFlow servers in your cluster and run your model code anywhere, such as on
-your laptop.
-
-In the above example, instead of calling `model_main`, you can call
-`tf.contrib.distribute.run_standard_tensorflow_server().join()`. This will bring
-up a cluster running standard TensorFlow servers that wait for your request to
-start training.
-
-On your laptop, you can run:
-
-```python
-distribution = tf.contrib.distribute.CollectiveAllReduceStrategy(
-    num_gpus_per_worker=2)
-config = tf.estimator.RunConfig(
-    experimental_distribute=tf.contrib.distribute.DistributeConfig(
-        train_distribute=distribution,
-        remote_cluster={"worker": ["host1:port", "host2:port", "host3:port"]}))
-estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
-train_spec = tf.estimator.TrainSpec(input_fn=input_fn)
-eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
-tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
-```
-
-Then you will see the training logs on your laptop. You can terminate the
-training by terminating your process on your laptop. You can also modify your
-code and run a new model against the same cluster.
-
-We've been optimizing the performance of standalone client mode. If you notice
-high latency between your laptop and your cluster, you can reduce that latency
-by running your model binary in the cluster.
-
-## Caveats
-
-This feature is in its early stages and there are a lot of improvements
-forthcoming:
-
-* Summaries are only computed in the first tower in `MirroredStrategy`.
-* Eager support is in the works; performance can be more challenging with eager
-execution.
-* We currently support the following predefined Keras callbacks: -`ModelCheckpointCallback`, `TensorBoardCallback`. We will soon be adding support for -some of the other callbacks such as `EarlyStopping`, `ReduceLROnPlateau`, etc. If you -create your own callback, you will not have access to all model properties and -validation data. -* If you are [`batching`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch) -your input data, we will place one batch on each GPU in each step. So your -effective batch size will be `num_gpus * batch_size`. Therefore, consider -adjusting your learning rate or batch size according to the number of GPUs. -We are working on addressing this limitation by splitting each batch across GPUs -instead. -* PartitionedVariables are not supported yet. - -## What's next? - -Please give distribution strategies a try. This feature is in early stages and -is evolving, so we welcome your feedback via -[issues on GitHub](https://github.com/tensorflow/tensorflow/issues/new). - - +See the guide for overview and examples: +[TensorFlow v1.x](https://www.tensorflow.org/guide/distribute_strategy), +[TensorFlow v2.x](https://www.tensorflow.org/alpha/guide/distribute_strategy). diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index c5ddf6b5533..ecece6b1ef2 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -8,10 +8,9 @@ package( default_visibility = [ "//tensorflow:internal", ], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) py_library( @@ -86,6 +85,7 @@ cuda_py_test( ], tags = [ "multi_and_single_gpu", + "no_oss", # TODO(b/133330625) ], ) @@ -227,64 +227,6 @@ cuda_py_test( ], ) -cuda_py_test( - name = "estimator_integration_test", - srcs = ["estimator_integration_test.py"], - additional_deps = [ - "//tensorflow/python/distribute:combinations", - "//tensorflow/python/distribute:strategy_combinations", - "@absl_py//absl/testing:parameterized", - "//third_party/py/numpy", - "//tensorflow/contrib/optimizer_v2:training", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/eager:test", - "//tensorflow/python/estimator:estimator_py", - "//tensorflow/python/feature_column", - "//tensorflow/python:framework_ops", - "//tensorflow/python:platform", - "//tensorflow/python:summary", - ], - tags = [ - "multi_and_single_gpu", - "no_oss", # http://b/119349471 - "tf_integration_test", - ], -) - -cuda_py_test( - name = "estimator_training_test", - srcs = ["estimator_training_test.py"], - additional_deps = [ - ":collective_all_reduce_strategy", - "//tensorflow/python/distribute:combinations", - "//tensorflow/python/distribute:strategy_combinations", - ":mirrored_strategy", - "//tensorflow/python/distribute:multi_worker_test_base", - ":parameter_server_strategy", - "//third_party/py/numpy", - "//tensorflow/contrib/optimizer_v2:training", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/distribute:distribute_config", - "//tensorflow/python/distribute:distribute_coordinator", - "//tensorflow/python/distribute:distribute_coordinator_context", - "//tensorflow/python/eager:test", - "//tensorflow/python/estimator:estimator_py", - "//tensorflow/python/feature_column", - "//tensorflow/python:framework_ops", - "//tensorflow/python:platform", - "//tensorflow/python:summary", - ], - shard_count = 48, - tags = [ - "multi_and_single_gpu", - # TODO(b/118768923): Re-enable {a,m,t}san test. 
- "noasan", - "nomsan", - "notsan", - "no_oss", # http://b/119349471 - ], -) - py_library( name = "monitor", srcs = ["monitor.py"], diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py index d6eff47fdc5..588fa47c6ae 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py @@ -498,8 +498,13 @@ class DistributedCollectiveAllReduceStrategyTest( self.assertEqual('grpc', server_def.protocol) mock_called[0] = True + def mock_configure_collective_ops(*args, **kwargs): + del args, kwargs + with test.mock.patch.object(context.context(), 'enable_collective_ops', - mock_enable_collective_ops): + mock_enable_collective_ops), \ + test.mock.patch.object(context.context(), 'configure_collective_ops', + mock_configure_collective_ops): strategy, _, _ = self._get_test_object( task_type='worker', task_id=1, num_gpus=2, use_core_strategy=True) self.assertTrue(strategy.extended._std_server_started) diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py deleted file mode 100644 index c46616ce60f..00000000000 --- a/tensorflow/contrib/distribute/python/estimator_integration_test.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests that show that DistributionStrategy works with canned Estimator.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import shutil -import tempfile -from absl.testing import parameterized -import numpy as np -from tensorflow.contrib.optimizer_v2 import adagrad -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from tensorflow.python.eager import test -from tensorflow.python.estimator import run_config -from tensorflow.python.estimator import training -from tensorflow.python.estimator.canned import dnn_linear_combined -from tensorflow.python.estimator.canned import prediction_keys -from tensorflow.python.estimator.export import export -from tensorflow.python.estimator.inputs import numpy_io -from tensorflow.python.feature_column import feature_column_lib as feature_column -from tensorflow.python.framework import ops -from tensorflow.python.platform import gfile -from tensorflow.python.summary.writer import writer_cache - - -class DNNLinearCombinedClassifierIntegrationTest(test.TestCase, - parameterized.TestCase): - - def setUp(self): - self._model_dir = tempfile.mkdtemp() - - def dataset_input_fn(self, x, y, batch_size, shuffle): - - def input_fn(): - dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) - if shuffle: - dataset = dataset.shuffle(batch_size) - dataset = dataset.repeat(10).batch(batch_size) - return dataset - - return input_fn - - @combinations.generate( - combinations.combine( - mode=['graph'], - distribution=[ - strategy_combinations.one_device_strategy, - strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.mirrored_strategy_with_two_gpus, - ], - use_train_and_evaluate=[True, False])) - def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate): - label_dimension = 2 - input_dimension = label_dimension - batch_size = 10 - data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32) - data = data.reshape(batch_size, label_dimension) - train_input_fn = self.dataset_input_fn( - x={'x': data}, - y=data, - batch_size=batch_size // distribution.num_replicas_in_sync, - shuffle=True) - eval_input_fn = self.dataset_input_fn( - x={'x': data}, - y=data, - batch_size=batch_size // distribution.num_replicas_in_sync, - shuffle=False) - predict_input_fn = numpy_io.numpy_input_fn( - x={'x': data}, batch_size=batch_size, shuffle=False) - - linear_feature_columns = [ - feature_column.numeric_column('x', shape=(input_dimension,)) - ] - dnn_feature_columns = [ - feature_column.numeric_column('x', shape=(input_dimension,)) - ] - feature_columns = linear_feature_columns + dnn_feature_columns - estimator = dnn_linear_combined.DNNLinearCombinedRegressor( - linear_feature_columns=linear_feature_columns, - dnn_hidden_units=(2, 2), - dnn_feature_columns=dnn_feature_columns, - label_dimension=label_dimension, - model_dir=self._model_dir, - # TODO(isaprykin): Work around the colocate_with error. 
- dnn_optimizer=adagrad.AdagradOptimizer(0.001), - linear_optimizer=adagrad.AdagradOptimizer(0.001), - config=run_config.RunConfig( - train_distribute=distribution, eval_distribute=distribution)) - - num_steps = 10 - if use_train_and_evaluate: - scores, _ = training.train_and_evaluate( - estimator, - training.TrainSpec(train_input_fn, max_steps=num_steps), - training.EvalSpec(eval_input_fn)) - else: - estimator.train(train_input_fn, steps=num_steps) - scores = estimator.evaluate(eval_input_fn) - - self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP]) - self.assertIn('loss', scores) - - predictions = np.array([ - x[prediction_keys.PredictionKeys.PREDICTIONS] - for x in estimator.predict(predict_input_fn) - ]) - self.assertAllEqual((batch_size, label_dimension), predictions.shape) - - feature_spec = feature_column.make_parse_example_spec(feature_columns) - serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn( - feature_spec) - export_dir = estimator.export_saved_model(tempfile.mkdtemp(), - serving_input_receiver_fn) - self.assertTrue(gfile.Exists(export_dir)) - - def tearDown(self): - if self._model_dir: - writer_cache.FileWriterCache.clear() - shutil.rmtree(self._model_dir) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py deleted file mode 100644 index 9eebdfd68d8..00000000000 --- a/tensorflow/contrib/distribute/python/estimator_training_test.py +++ /dev/null @@ -1,620 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests that show Distribute Coordinator works with Estimator.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import glob -import json -import os -import sys -import tempfile -from absl.testing import parameterized -import numpy as np - -from tensorflow.contrib.distribute.python import collective_all_reduce_strategy -from tensorflow.contrib.distribute.python import mirrored_strategy -from tensorflow.contrib.distribute.python import parameter_server_strategy -from tensorflow.contrib.optimizer_v2 import adagrad -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib -from tensorflow.python.distribute import distribute_coordinator as dc -from tensorflow.python.distribute import estimator_training as dc_training -from tensorflow.python.distribute import multi_worker_test_base -from tensorflow.python.distribute.distribute_config import DistributeConfig -from tensorflow.python.eager import context -from tensorflow.python.estimator import exporter as exporter_lib -from tensorflow.python.estimator import run_config as run_config_lib -from tensorflow.python.estimator import training as estimator_training -from tensorflow.python.estimator.canned import dnn_linear_combined -from tensorflow.python.estimator.canned import prediction_keys -from tensorflow.python.estimator.export import export as export_lib -from tensorflow.python.feature_column import feature_column_lib as feature_column -from tensorflow.python.platform import gfile -from tensorflow.python.platform import test -from tensorflow.python.summary import summary_iterator -from tensorflow.python.summary.writer import writer_cache -from tensorflow.python.training import session_manager - - -BATCH_SIZE = 10 -LABEL_DIMENSION = 2 -DATA = np.linspace( - 0., 2., BATCH_SIZE * LABEL_DIMENSION, dtype=np.float32).reshape( - BATCH_SIZE, LABEL_DIMENSION) -EVAL_NAME = "foo" -EXPORTER_NAME = "saved_model_exporter" -MAX_STEPS = 10 - -CHIEF = dc._TaskType.CHIEF -EVALUATOR = dc._TaskType.EVALUATOR -WORKER = dc._TaskType.WORKER -PS = dc._TaskType.PS - -original_run_std_server = dc._run_std_server - - -class DistributeCoordinatorIntegrationTest( - multi_worker_test_base.IndependentWorkerTestBase, parameterized.TestCase): - - @classmethod - def setUpClass(cls): - """Create a local cluster with 2 workers.""" - super(DistributeCoordinatorIntegrationTest, cls).setUpClass() - cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( - num_workers=3, num_ps=2, has_eval=True) - - def setUp(self): - self._model_dir = tempfile.mkdtemp() - super(DistributeCoordinatorIntegrationTest, self).setUp() - - def dataset_input_fn(self, x, y, batch_size, shuffle): - - def input_fn(): - dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) - if shuffle: - dataset = dataset.shuffle(batch_size) - dataset = dataset.repeat(100).batch(batch_size) - return dataset - - return input_fn - - def _get_exporter(self, name, fc): - feature_spec = feature_column.make_parse_example_spec(fc) - serving_input_receiver_fn = ( - export_lib.build_parsing_serving_input_receiver_fn(feature_spec)) - return exporter_lib.LatestExporter( - name, serving_input_receiver_fn=serving_input_receiver_fn) - - def _extract_loss_and_global_step(self, event_folder): - """Returns the loss and global step in 
last event.""" - event_paths = glob.glob(os.path.join(event_folder, "events*")) - self.assertNotEmpty( - event_paths, msg="Event file not found in dir %s" % event_folder) - - loss = None - global_step_count = None - - for e in summary_iterator.summary_iterator(event_paths[-1]): - current_loss = None - for v in e.summary.value: - if v.tag == "loss": - current_loss = v.simple_value - - # If loss is not found, global step is meaningless. - if current_loss is None: - continue - - current_global_step = e.step - if global_step_count is None or current_global_step > global_step_count: - global_step_count = current_global_step - loss = current_loss - - return (loss, global_step_count) - - def _get_estimator(self, - train_distribute, - eval_distribute, - remote_cluster=None): - input_dimension = LABEL_DIMENSION - linear_feature_columns = [ - feature_column.numeric_column("x", shape=(input_dimension,)) - ] - dnn_feature_columns = [ - feature_column.numeric_column("x", shape=(input_dimension,)) - ] - - return dnn_linear_combined.DNNLinearCombinedRegressor( - linear_feature_columns=linear_feature_columns, - dnn_hidden_units=(2, 2), - dnn_feature_columns=dnn_feature_columns, - label_dimension=LABEL_DIMENSION, - model_dir=self._model_dir, - dnn_optimizer=adagrad.AdagradOptimizer(0.001), - linear_optimizer=adagrad.AdagradOptimizer(0.001), - config=run_config_lib.RunConfig( - experimental_distribute=DistributeConfig( - train_distribute=train_distribute, - eval_distribute=eval_distribute, - remote_cluster=remote_cluster))) - - def _complete_flow(self, - train_distribute, - eval_distribute, - remote_cluster=None, - use_train_and_evaluate=True): - estimator = self._get_estimator(train_distribute, eval_distribute, - remote_cluster) - - input_dimension = LABEL_DIMENSION - train_input_fn = self.dataset_input_fn( - x={"x": DATA}, - y=DATA, - batch_size=BATCH_SIZE // train_distribute.num_replicas_in_sync, - shuffle=True) - if eval_distribute: - eval_batch_size = BATCH_SIZE // eval_distribute.num_replicas_in_sync - else: - eval_batch_size = BATCH_SIZE - eval_input_fn = self.dataset_input_fn( - x={"x": DATA}, y=DATA, batch_size=eval_batch_size, shuffle=False) - - linear_feature_columns = [ - feature_column.numeric_column("x", shape=(input_dimension,)) - ] - dnn_feature_columns = [ - feature_column.numeric_column("x", shape=(input_dimension,)) - ] - feature_columns = linear_feature_columns + dnn_feature_columns - - eval_spec = estimator_training.EvalSpec( - name=EVAL_NAME, - input_fn=eval_input_fn, - steps=None, - exporters=self._get_exporter(EXPORTER_NAME, feature_columns), - start_delay_secs=0, - throttle_secs=1) - - if use_train_and_evaluate: - estimator_training.train_and_evaluate( - estimator, - estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS), - eval_spec) - else: - estimator.train(train_input_fn, max_steps=MAX_STEPS) - - latest_ckpt_path = estimator.latest_checkpoint() - metrics = estimator.evaluate(eval_input_fn, - checkpoint_path=latest_ckpt_path, - name=EVAL_NAME) - - # Export the eval result to files. - eval_result = estimator_training._EvalResult( - status=estimator_training._EvalStatus.EVALUATED, - metrics=metrics, - checkpoint_path=latest_ckpt_path) - evaluator = estimator_training._TrainingExecutor._Evaluator(estimator, - eval_spec, - None) - evaluator._export_eval_result(eval_result, True) - - return estimator - - def _inspect_train_and_eval_events(self, estimator): - # Make sure nothing is stuck in limbo. - writer_cache.FileWriterCache.clear() - - # Examine the training events. 
Use a range to check global step to avoid - # flakyness due to global step race condition. - training_loss, _ = self._extract_loss_and_global_step(self._model_dir) - self.assertIsNotNone(training_loss) - - # Examine the eval events. The global step should be accurate. - eval_dir = os.path.join(self._model_dir, "eval_" + EVAL_NAME) - eval_loss, eval_global_step = self._extract_loss_and_global_step( - event_folder=eval_dir) - self.assertIsNotNone(eval_loss) - self.assertGreaterEqual(eval_global_step, MAX_STEPS) - - # Examine the export folder. - export_dir = os.path.join( - os.path.join(self._model_dir, "export"), EXPORTER_NAME) - self.assertTrue(gfile.Exists(export_dir)) - - # Examine the ckpt for predict. - def predict_input_fn(): - return dataset_ops.Dataset.from_tensor_slices({ - "x": DATA - }).batch(BATCH_SIZE) - - predicted_proba = np.array([ - x[prediction_keys.PredictionKeys.PREDICTIONS] - for x in estimator.predict(predict_input_fn) - ]) - self.assertAllEqual((BATCH_SIZE, LABEL_DIMENSION), predicted_proba.shape) - - def _make_cross_device_ops(self, num_gpus_per_worker): - return cross_device_ops_lib.MultiWorkerAllReduce( - ["/job:worker/task:0", "/job:worker/task:1", "/job:worker/task:2"], - num_gpus_per_worker) - - def _get_strategy_object(self, strategy_cls, eval_strategy=False): - if strategy_cls == mirrored_strategy.CoreMirroredStrategy: - if eval_strategy: - return strategy_cls() - else: - return strategy_cls( - cross_device_ops=self._make_cross_device_ops( - num_gpus_per_worker=context.num_gpus())) - elif (strategy_cls == mirrored_strategy.MirroredStrategy and - not eval_strategy): - return strategy_cls( - num_gpus_per_worker=context.num_gpus(), - cross_device_ops=self._make_cross_device_ops( - num_gpus_per_worker=context.num_gpus())) - else: - return strategy_cls(num_gpus_per_worker=context.num_gpus()) - - @combinations.generate( - combinations.combine( - mode=["graph"], - train_distribute_cls=[ - collective_all_reduce_strategy.CollectiveAllReduceStrategy, - mirrored_strategy.MirroredStrategy, - mirrored_strategy.CoreMirroredStrategy, - parameter_server_strategy.ParameterServerStrategy - ], - eval_distribute_cls=[ - None, - mirrored_strategy.MirroredStrategy, - mirrored_strategy.CoreMirroredStrategy, - parameter_server_strategy.ParameterServerStrategy, - collective_all_reduce_strategy.CollectiveAllReduceStrategy, - ], - required_gpus=[0, 1])) - def test_complete_flow_standalone_client(self, train_distribute_cls, - eval_distribute_cls): - train_distribute = self._get_strategy_object(train_distribute_cls) - - if eval_distribute_cls: - eval_distribute = self._get_strategy_object( - eval_distribute_cls, eval_strategy=True) - else: - eval_distribute = None - - cluster_spec = copy.deepcopy(self._cluster_spec) - if (train_distribute_cls != - parameter_server_strategy.ParameterServerStrategy): - cluster_spec.pop("ps", None) - estimator = self._complete_flow(train_distribute, eval_distribute, - cluster_spec) - self._inspect_train_and_eval_events(estimator) - - @combinations.generate( - combinations.combine( - mode=["graph"], - eval_distribute_class=[ - None, - mirrored_strategy.MirroredStrategy, - mirrored_strategy.CoreMirroredStrategy, - parameter_server_strategy.ParameterServerStrategy, - ], - required_gpus=[0, 1])) - def test_complete_flow_standalone_client_collective_nccl( - self, eval_distribute_class): - train_distribute = ( - collective_all_reduce_strategy.CollectiveAllReduceStrategy( - num_gpus_per_worker=context.num_gpus(), - 
communication=cross_device_ops_lib.CollectiveCommunication.NCCL)) - - if eval_distribute_class: - eval_distribute = self._get_strategy_object( - eval_distribute_class, eval_strategy=True) - else: - eval_distribute = None - - cluster_spec = copy.deepcopy(self._cluster_spec) - cluster_spec.pop("ps", None) - estimator = self._complete_flow(train_distribute, eval_distribute, - cluster_spec) - self._inspect_train_and_eval_events(estimator) - - @combinations.generate( - combinations.combine( - mode=["graph"], - train_distribute_cls=[ - mirrored_strategy.MirroredStrategy, - mirrored_strategy.CoreMirroredStrategy, - ], - eval_distribute_cls=[ - None, - mirrored_strategy.MirroredStrategy, - mirrored_strategy.CoreMirroredStrategy, - ], - required_gpus=[0, 1])) - def test_estimator_standalone_client(self, train_distribute_cls, - eval_distribute_cls): - train_distribute = self._get_strategy_object(train_distribute_cls) - - if eval_distribute_cls: - eval_distribute = self._get_strategy_object(eval_distribute_cls) - else: - eval_distribute = None - - # We use the whole cluster for evaluation. - cluster = copy.deepcopy(self._cluster_spec) - cluster.pop("evaluator", None) - - estimator = self._complete_flow( - train_distribute, eval_distribute, remote_cluster=cluster, - use_train_and_evaluate=False) - self._inspect_train_and_eval_events(estimator) - - def _mock_run_std_server(self, *args, **kwargs): - ret = original_run_std_server(*args, **kwargs) - # Wait for all std servers to be brought up in order to reduce the chance of - # remote sessions taking local ports that have been assigned to std servers. - self._barrier.wait() - return ret - - def _independent_worker_fn( - self, - train_distribute, - eval_distribute, - ): - with test.mock.patch.object(dc, "_run_std_server", - self._mock_run_std_server): - self._complete_flow(train_distribute, eval_distribute) - - @combinations.generate( - combinations.combine( - mode=["graph"], - train_distribute_cls=[ - collective_all_reduce_strategy.CollectiveAllReduceStrategy, - parameter_server_strategy.ParameterServerStrategy, - ], - eval_distribute_cls=[ - None, - mirrored_strategy.MirroredStrategy, - mirrored_strategy.CoreMirroredStrategy, - parameter_server_strategy.ParameterServerStrategy, - collective_all_reduce_strategy.CollectiveAllReduceStrategy, - ], - required_gpus=[0, 1])) - def test_complete_flow_independent_worker_between_graph( - self, train_distribute_cls, eval_distribute_cls): - if (context.num_gpus() < 2 and eval_distribute_cls == - collective_all_reduce_strategy.CollectiveAllReduceStrategy): - self.skipTest("`CollectiveAllReduceStrategy` needs at least two towers.") - - train_distribute = self._get_strategy_object(train_distribute_cls) - - if eval_distribute_cls: - eval_distribute = self._get_strategy_object( - eval_distribute_cls, eval_strategy=True) - else: - eval_distribute = None - - if (train_distribute_cls == parameter_server_strategy - .ParameterServerStrategy): - cluster_spec = multi_worker_test_base.create_cluster_spec( - num_workers=3, num_ps=2, has_eval=True) - # 3 workers, 2 ps and 1 evaluator. - self._barrier = dc._Barrier(6) - else: - cluster_spec = multi_worker_test_base.create_cluster_spec( - num_workers=3, num_ps=0, has_eval=True) - # 3 workers and 1 evaluator. 
- self._barrier = dc._Barrier(4) - - threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn, - cluster_spec, train_distribute, - eval_distribute) - threads_to_join = [] - for task_type, ts in threads.items(): - if task_type == PS: - continue - for t in ts: - threads_to_join.append(t) - self.join_independent_workers(threads_to_join) - - estimator = self._get_estimator(train_distribute, eval_distribute) - self._inspect_train_and_eval_events(estimator) - - @combinations.generate( - combinations.combine( - mode=["graph"], - train_distribute_cls=[ - mirrored_strategy.MirroredStrategy, - mirrored_strategy.CoreMirroredStrategy - ], - eval_distribute_cls=[ - None, - mirrored_strategy.MirroredStrategy, - mirrored_strategy.CoreMirroredStrategy - ], - required_gpus=[0, 1])) - def test_complete_flow_independent_worker_in_graph(self, train_distribute_cls, - eval_distribute_cls): - train_distribute = self._get_strategy_object(train_distribute_cls) - - if eval_distribute_cls: - eval_distribute = self._get_strategy_object( - eval_distribute_cls, eval_strategy=True) - else: - eval_distribute = None - - cluster_spec = multi_worker_test_base.create_cluster_spec( - num_workers=3, num_ps=0, has_eval=True) - # 3 workers and 1 evaluator. - self._barrier = dc._Barrier(4) - threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn, - cluster_spec, train_distribute, - eval_distribute) - self.join_independent_workers([threads[WORKER][0], threads[EVALUATOR][0]]) - - estimator = self._get_estimator(train_distribute, eval_distribute) - self._inspect_train_and_eval_events(estimator) - - -TF_CONFIG_WITH_CHIEF = { - "cluster": { - "chief": ["fake_chief"], - }, - "task": { - "type": "chief", - "index": 0 - } -} - -TF_CONFIG_WITH_MASTER = { - "cluster": { - "master": ["fake_master"], - }, - "task": { - "type": "master", - "index": 0 - } -} - -TF_CONFIG_WITHOUT_TASK = {"cluster": {"chief": ["fake_worker"]}} - - -class RunConfigTest(test.TestCase): - - def test_previously_unexpected_cluster_spec(self): - with test.mock.patch.dict( - "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}): - run_config_lib.RunConfig( - experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.CoreMirroredStrategy( - ["/device:GPU:0", "/device:GPU:1"]))) - - def test_should_run_distribute_coordinator(self): - """Tests that should_run_distribute_coordinator return a correct value.""" - # We don't use distribute coordinator for local training. - self.assertFalse( - dc_training.should_run_distribute_coordinator( - run_config_lib.RunConfig())) - - # When `train_distribute` is not specified, don't use distribute - # coordinator. - with test.mock.patch.dict("os.environ", - {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): - self.assertFalse( - dc_training.should_run_distribute_coordinator( - run_config_lib.RunConfig())) - - # When `train_distribute` is specified and TF_CONFIG is detected, use - # distribute coordinator. 
- with test.mock.patch.dict("os.environ", - {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): - config_with_train_distribute = run_config_lib.RunConfig( - experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.CoreMirroredStrategy( - ["/device:GPU:0", "/device:GPU:1"]))) - config_with_eval_distribute = run_config_lib.RunConfig( - experimental_distribute=DistributeConfig( - eval_distribute=mirrored_strategy.CoreMirroredStrategy( - ["/device:GPU:0", "/device:GPU:1"]))) - self.assertTrue( - dc_training.should_run_distribute_coordinator( - config_with_train_distribute)) - self.assertFalse( - dc_training.should_run_distribute_coordinator( - config_with_eval_distribute)) - - # With a master in the cluster, don't run distribute coordinator. - with test.mock.patch.dict("os.environ", - {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): - config = run_config_lib.RunConfig( - experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.CoreMirroredStrategy( - ["/device:GPU:0", "/device:GPU:1"]))) - self.assertFalse(dc_training.should_run_distribute_coordinator(config)) - - def test_init_run_config_duplicate_distribute(self): - with self.assertRaises(ValueError): - run_config_lib.RunConfig( - train_distribute=mirrored_strategy.CoreMirroredStrategy(), - experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.CoreMirroredStrategy())) - - with self.assertRaises(ValueError): - run_config_lib.RunConfig( - eval_distribute=mirrored_strategy.CoreMirroredStrategy(), - experimental_distribute=DistributeConfig( - eval_distribute=mirrored_strategy.CoreMirroredStrategy())) - - def test_init_run_config_none_distribute_coordinator_mode(self): - # We don't use distribute coordinator for local training. - config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.CoreMirroredStrategy()) - dc_training.init_run_config(config, {}) - self.assertIsNone(config._distribute_coordinator_mode) - - # With a master in the cluster, don't run distribute coordinator. - with test.mock.patch.dict("os.environ", - {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): - config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.CoreMirroredStrategy()) - self.assertIsNone(config._distribute_coordinator_mode) - - # When `train_distribute` is not specified, don't use distribute - # coordinator. - with test.mock.patch.dict("os.environ", - {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): - config = run_config_lib.RunConfig() - self.assertFalse(hasattr(config, "_distribute_coordinator_mode")) - - def test_init_run_config_independent_worker(self): - # When `train_distribute` is specified and TF_CONFIG is detected, use - # distribute coordinator with INDEPENDENT_WORKER mode. - with test.mock.patch.dict("os.environ", - {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): - config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.CoreMirroredStrategy()) - self.assertEqual(config._distribute_coordinator_mode, - dc.CoordinatorMode.INDEPENDENT_WORKER) - - def test_init_run_config_standalone_client(self): - # When `train_distribute` is specified, TF_CONFIG is detected and - # `experimental.remote_cluster` is set use distribute coordinator with - # STANDALONE_CLIENT mode. 
- config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.CoreMirroredStrategy(), - experimental_distribute=DistributeConfig( - remote_cluster={"chief": ["fake_worker"]})) - self.assertEqual(config._distribute_coordinator_mode, - dc.CoordinatorMode.STANDALONE_CLIENT) - - -if __name__ == "__main__": - # Reduce `recovery_wait_secs` from 30 seconds so the test completes quickly. - orig_init = session_manager.SessionManager.__init__ - - def new_init(*args, **kwargs): - kwargs.pop("recovery_wait_secs", None) - kwargs["recovery_wait_secs"] = 0.5 - orig_init(*args, **kwargs) - - session_manager.SessionManager.__init__ = new_init - - with test.mock.patch.object(sys, "exit", os._exit): - test.main() diff --git a/tensorflow/contrib/distribute/python/examples/BUILD b/tensorflow/contrib/distribute/python/examples/BUILD index 75fbc3bf53f..afabba7bfb4 100644 --- a/tensorflow/contrib/distribute/python/examples/BUILD +++ b/tensorflow/contrib/distribute/python/examples/BUILD @@ -4,10 +4,9 @@ package( default_visibility = [ "//tensorflow:internal", ], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) py_binary( diff --git a/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py b/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py index c045a5586b9..502f94c5728 100644 --- a/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py +++ b/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py @@ -37,8 +37,6 @@ flags.DEFINE_integer("batch_size", 64, flags.DEFINE_integer("num_epochs", 10, "How many epochs to run?") flags.DEFINE_float("learning_rate", 0.01, "Learning Rate") flags.DEFINE_float("momentum", 0.5, "SGD momentum") -flags.DEFINE_boolean("use_function", False, - "Should we wrap the step in a tf.function.") FLAGS = flags.FLAGS NUM_TRAIN_IMAGES = 60000 @@ -70,15 +68,13 @@ def compute_loss(logits, labels): return loss * (1. / FLAGS.batch_size) -def mnist_datasets(): +def mnist_datasets(strategy): (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data() # Numpy defaults to dtype=float64; TF defaults to float32. Stick with float32. x_train, x_test = x_train / np.float32(255), x_test / np.float32(255) y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64) - # TODO(priyag): `strategy.make_numpy_iterator` can be used directly instead of - # converting to datasets. 
- train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) - test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)) + train_dataset = strategy.experimental_make_numpy_dataset((x_train, y_train)) + test_dataset = strategy.experimental_make_numpy_dataset((x_test, y_test)) return train_dataset, test_dataset @@ -97,7 +93,7 @@ def main(unused_argv): strategy = tf.distribute.MirroredStrategy(devices) with strategy.scope(): - train_ds, test_ds = mnist_datasets() + train_ds, test_ds = mnist_datasets(strategy) train_ds = train_ds.shuffle(NUM_TRAIN_IMAGES).batch(FLAGS.batch_size) test_ds = test_ds.batch(FLAGS.batch_size) @@ -110,55 +106,47 @@ def main(unused_argv): test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( "test_accuracy", dtype=tf.float32) - def train_step(inputs): - images, labels = inputs - with tf.GradientTape() as tape: - logits = model(images, training=True) + @tf.function + def train_epoch(train_dist_dataset): + """Training Step.""" + def step_fn(images, labels): + with tf.GradientTape() as tape: + logits = model(images, training=True) + loss = compute_loss(logits, labels) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + training_loss.update_state(loss) + training_accuracy.update_state(labels, logits) + + for images, labels in train_dist_dataset: + strategy.experimental_run_v2(step_fn, args=(images, labels)) + + @tf.function + def test_epoch(test_dist_dataset): + """Testing Step.""" + def step_fn(images, labels): + logits = model(images, training=False) loss = compute_loss(logits, labels) - grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) - training_loss.update_state(loss) - training_accuracy.update_state(labels, logits) + test_loss.update_state(loss) + test_accuracy.update_state(labels, logits) - def test_step(inputs): - images, labels = inputs - logits = model(images, training=False) - loss = compute_loss(logits, labels) - test_loss.update_state(loss) - test_accuracy.update_state(labels, logits) + for images, labels in test_dist_dataset: + strategy.experimental_run_v2(step_fn, args=(images, labels)) - train_iterator = strategy.make_dataset_iterator(train_ds) - test_iterator = strategy.make_dataset_iterator(test_ds) - - for epoch in range(0, FLAGS.num_epochs): - # TODO(b/123315763): Create the tf.function outside this loop once we are - # able to initialize iterator in eager mode. 
- dist_train = lambda it: strategy.experimental_run(train_step, it) - dist_test = lambda it: strategy.experimental_run(test_step, it) - if FLAGS.use_function: - dist_train = tf.function(dist_train) - dist_test = tf.function(dist_test) + train_dist_dataset = strategy.experimental_distribute_dataset(train_ds) + test_dist_dataset = strategy.experimental_distribute_dataset(test_ds) + for epoch in range(FLAGS.num_epochs): # Train print("Starting epoch {}".format(epoch)) - train_iterator.initialize() - while True: - try: - dist_train(train_iterator) - except tf.errors.OutOfRangeError: - break + train_epoch(train_dist_dataset) print("Training loss: {:0.4f}, accuracy: {:0.2f}%".format( training_loss.result(), training_accuracy.result() * 100)) training_loss.reset_states() training_accuracy.reset_states() # Test - test_iterator.initialize() - while True: - try: - dist_test(test_iterator) - except tf.errors.OutOfRangeError: - break + test_epoch(test_dist_dataset) print("Test loss: {:0.4f}, accuracy: {:0.2f}%".format( test_loss.result(), test_accuracy.result() * 100)) test_loss.reset_states() diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py index df5e5595ccb..bbae1174e49 100644 --- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py +++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py @@ -72,8 +72,9 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase): def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss=True): with distribution.scope(): + optimizer = optimizer_fn() model_fn, dataset_fn, layer = minimize_loss_example( - optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss) + optimizer, use_bias=True, use_callable_loss=use_callable_loss) iterator = distribution.make_input_fn_iterator(lambda _: dataset_fn()) def run_step(): diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py index a5fead9596d..90f174d0d47 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py @@ -46,7 +46,7 @@ class ParameterServerStrategy(distribute_lib.StrategyV1): becomes local training where variables are assigned to local CPU or the only GPU. When each worker has more than one GPU, operations will be replicated on these GPUs. In both cases, operations are replicated but variables are not and - these workers share a common view for which paramater server a variable is + these workers share a common view for which parameter server a variable is assigned to. This class assumes between-graph replication will be used and works on a graph diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index e4b7b81d083..5f6bca0cbe1 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -2,12 +2,13 @@ # Contains ops for statistical distributions (with pdf, cdf, sample, etc...). # APIs here are meant to evolve over time. 
-package(default_visibility = [ - "//learning/brain/contrib/bayesflow:__subpackages__", - "//tensorflow:__subpackages__", -]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//learning/brain/contrib/bayesflow:__subpackages__", + "//tensorflow:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) @@ -94,7 +95,7 @@ cuda_py_test( "//third_party/py/numpy", "@six_archive//:six", "//tensorflow/contrib/learn", - "//tensorflow/contrib/learn:head_test", + "//tensorflow/contrib/learn:head_test_lib", "//tensorflow/python/ops/distributions", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_ops", diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD index a500f9fd34c..342bd5ae6d7 100644 --- a/tensorflow/contrib/eager/python/BUILD +++ b/tensorflow/contrib/eager/python/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_test") diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD index fd5a44a7975..14e97a5138f 100644 --- a/tensorflow/contrib/eager/python/examples/BUILD +++ b/tensorflow/contrib/eager/python/examples/BUILD @@ -1,8 +1,9 @@ # TensorFlow code for training gradient boosted trees. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) py_library( name = "examples_pip", diff --git a/tensorflow/contrib/eager/python/examples/densenet/BUILD b/tensorflow/contrib/eager/python/examples/densenet/BUILD index a001d426fe2..2b85833c151 100644 --- a/tensorflow/contrib/eager/python/examples/densenet/BUILD +++ b/tensorflow/contrib/eager/python/examples/densenet/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "py_binary") diff --git a/tensorflow/contrib/eager/python/examples/gan/BUILD b/tensorflow/contrib/eager/python/examples/gan/BUILD index be561a1da66..aaf736c0ded 100644 --- a/tensorflow/contrib/eager/python/examples/gan/BUILD +++ b/tensorflow/contrib/eager/python/examples/gan/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "py_binary") diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/BUILD b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD index 35d50990421..99edc8223d0 100644 --- a/tensorflow/contrib/eager/python/examples/l2hmc/BUILD +++ b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "cuda_py_test") diff --git 
a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD index 8536fdbf705..4a11f95902c 100644 --- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD +++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "py_binary") diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD index a80f3d210a4..f397925e9e1 100644 --- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD +++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "cuda_py_test") diff --git a/tensorflow/contrib/eager/python/examples/revnet/BUILD b/tensorflow/contrib/eager/python/examples/revnet/BUILD index a48d08b8a3a..63c0edde775 100644 --- a/tensorflow/contrib/eager/python/examples/revnet/BUILD +++ b/tensorflow/contrib/eager/python/examples/revnet/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "cuda_py_test") diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD index aca0b2f05f6..d54ae37192c 100644 --- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD +++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "py_binary") diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD index ef683ce232b..3232644d4ff 100644 --- a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD +++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "py_binary") diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD index 72f1829ffc4..3b676564e4d 100644 --- a/tensorflow/contrib/eager/python/examples/spinn/BUILD +++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "py_test") diff --git 
a/tensorflow/contrib/eager/python/remote_test.py b/tensorflow/contrib/eager/python/remote_test.py index fb8ae11d6f6..fc78e46a5b1 100644 --- a/tensorflow/contrib/eager/python/remote_test.py +++ b/tensorflow/contrib/eager/python/remote_test.py @@ -23,6 +23,7 @@ import os import numpy as np +from tensorflow.python import pywrap_tensorflow from tensorflow.contrib.eager.python import parameter_server from tensorflow.core.protobuf import cluster_pb2 from tensorflow.core.protobuf import tensorflow_server_pb2 @@ -92,10 +93,11 @@ class RemoteExecutionTest(test.TestCase): def setUp(self): # Start the local server. + local_port = pywrap_tensorflow.TF_PickUnusedPortOrDie() context.set_server_def( server_def=get_server_def( JOB_NAME, - local_server_port=0, + local_server_port=local_port, remote_server_addresses=[ self._cached_server1_target, self._cached_server2_target ], diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD index a888379f13e..f1cb596bce0 100644 --- a/tensorflow/contrib/estimator/BUILD +++ b/tensorflow/contrib/estimator/BUILD @@ -2,10 +2,9 @@ package( default_visibility = [ "//tensorflow:internal", ], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - load("//tensorflow:tensorflow.bzl", "py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_test") # PLACEHOLDER PIP REQUIREMENTS diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD index ab510b86d15..f82b9e8dedd 100644 --- a/tensorflow/contrib/factorization/BUILD +++ b/tensorflow/contrib/factorization/BUILD @@ -2,12 +2,13 @@ # Contains ops for factorization of data, including matrix factorization and # clustering. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") diff --git a/tensorflow/contrib/factorization/examples/BUILD b/tensorflow/contrib/factorization/examples/BUILD index 363baa121ab..92bcaf870ba 100644 --- a/tensorflow/contrib/factorization/examples/BUILD +++ b/tensorflow/contrib/factorization/examples/BUILD @@ -1,11 +1,12 @@ # Example TensorFlow models using factorization ops. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "tf_py_test") tf_py_test( diff --git a/tensorflow/contrib/factorization/kernels/BUILD b/tensorflow/contrib/factorization/kernels/BUILD index 23d7e088d06..7b9bef2c989 100644 --- a/tensorflow/contrib/factorization/kernels/BUILD +++ b/tensorflow/contrib/factorization/kernels/BUILD @@ -1,11 +1,12 @@ # OpKernels for data factorization and clustering. 
-licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "tf_cc_test") cc_library( diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD index edd6f36e07c..9092f19c86e 100644 --- a/tensorflow/contrib/feature_column/BUILD +++ b/tensorflow/contrib/feature_column/BUILD @@ -2,10 +2,9 @@ package( default_visibility = [ "//tensorflow:internal", ], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - load("//tensorflow:tensorflow.bzl", "tf_py_test") py_library( diff --git a/tensorflow/contrib/ffmpeg/BUILD b/tensorflow/contrib/ffmpeg/BUILD index f7b3273a4d3..9b47ec8d39a 100644 --- a/tensorflow/contrib/ffmpeg/BUILD +++ b/tensorflow/contrib/ffmpeg/BUILD @@ -1,12 +1,13 @@ # Ops that process audio and/or video files using FFmpeg. # (https://www.ffmpeg.org/) -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") load("//tensorflow:tensorflow.bzl", "tf_copts") load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op.cc b/tensorflow/contrib/ffmpeg/decode_audio_op.cc index 5ab57ca4cd4..ca65ad45326 100644 --- a/tensorflow/contrib/ffmpeg/decode_audio_op.cc +++ b/tensorflow/contrib/ffmpeg/decode_audio_op.cc @@ -137,18 +137,17 @@ class DecodeAudioOpV2 : public OpKernel { const tensorflow::StringPiece contents = contents_tensor.scalar()(); const string file_format = - str_util::Lowercase(file_format_tensor.scalar()()); + absl::AsciiStrToLower(file_format_tensor.scalar()()); const int32 samples_per_second = samples_per_second_tensor.scalar()(); const int32 channel_count = channel_count_tensor.scalar()(); const std::set valid_file_formats( kValidFileFormats, kValidFileFormats + TF_ARRAYSIZE(kValidFileFormats)); - OP_REQUIRES( - context, valid_file_formats.count(file_format) == 1, - errors::InvalidArgument("file_format must be one of {", - str_util::Join(valid_file_formats, ", "), - "}, but was: \"", file_format, "\"")); + OP_REQUIRES(context, valid_file_formats.count(file_format) == 1, + errors::InvalidArgument("file_format must be one of {", + absl::StrJoin(valid_file_formats, ", "), + "}, but was: \"", file_format, "\"")); OP_REQUIRES(context, samples_per_second > 0, errors::InvalidArgument( "samples_per_second must be positive, but got: ", @@ -220,14 +219,13 @@ class DecodeAudioOp : public OpKernel { public: explicit DecodeAudioOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("file_format", &file_format_)); - file_format_ = str_util::Lowercase(file_format_); + file_format_ = absl::AsciiStrToLower(file_format_); const std::set valid_file_formats( kValidFileFormats, kValidFileFormats + TF_ARRAYSIZE(kValidFileFormats)); - OP_REQUIRES( - context, valid_file_formats.count(file_format_) == 1, - errors::InvalidArgument("file_format must be one of {", - str_util::Join(valid_file_formats, ", "), - "}, but was: \"", file_format_, "\"")); + OP_REQUIRES(context, valid_file_formats.count(file_format_) == 1, + errors::InvalidArgument("file_format must be one of {", + absl::StrJoin(valid_file_formats, ", "), 
+ "}, but was: \"", file_format_, "\"")); OP_REQUIRES_OK(context, context->GetAttr("channel_count", &channel_count_)); OP_REQUIRES(context, channel_count_ > 0, diff --git a/tensorflow/contrib/ffmpeg/default/BUILD b/tensorflow/contrib/ffmpeg/default/BUILD index 59bad8982dd..ec034946c5e 100644 --- a/tensorflow/contrib/ffmpeg/default/BUILD +++ b/tensorflow/contrib/ffmpeg/default/BUILD @@ -2,12 +2,13 @@ # Libraries and kernels for manipulating audio and video using FFmpeg. # (https://www.ffmpeg.org) -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "tf_cc_test") cc_library( diff --git a/tensorflow/contrib/ffmpeg/encode_audio_op.cc b/tensorflow/contrib/ffmpeg/encode_audio_op.cc index c00cccd8461..7de09e062ec 100644 --- a/tensorflow/contrib/ffmpeg/encode_audio_op.cc +++ b/tensorflow/contrib/ffmpeg/encode_audio_op.cc @@ -95,7 +95,7 @@ class EncodeAudioOpV2 : public OpKernel { bits_per_second_tensor.shape().DebugString())); const string file_format = - str_util::Lowercase(file_format_tensor.scalar()()); + absl::AsciiStrToLower(file_format_tensor.scalar()()); const int32 samples_per_second = samples_per_second_tensor.scalar()(); const int32 bits_per_second = bits_per_second_tensor.scalar()(); @@ -157,7 +157,7 @@ class EncodeAudioOp : public OpKernel { public: explicit EncodeAudioOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("file_format", &file_format_)); - file_format_ = str_util::Lowercase(file_format_); + file_format_ = absl::AsciiStrToLower(file_format_); OP_REQUIRES(context, file_format_ == "wav", errors::InvalidArgument("file_format arg must be \"wav\".")); diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD index 91e2954079e..f3385c07745 100644 --- a/tensorflow/contrib/framework/BUILD +++ b/tensorflow/contrib/framework/BUILD @@ -1,13 +1,14 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. 
-package(default_visibility = [ - "//learning/brain:__subpackages__", - "//tensorflow:__subpackages__", - "//tensorflow_model_optimization:__subpackages__", -]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//learning/brain:__subpackages__", + "//tensorflow:__subpackages__", + "//tensorflow_model_optimization:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD index 2dfbd646a65..4c8c5d90b47 100644 --- a/tensorflow/contrib/fused_conv/BUILD +++ b/tensorflow/contrib/fused_conv/BUILD @@ -4,6 +4,7 @@ package( default_visibility = ["//visibility:private"], + licenses = ["notice"], # Apache 2.0 ) package_group( @@ -13,8 +14,6 @@ package_group( ], ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) load( @@ -73,14 +72,17 @@ tf_kernel_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:logger", "//tensorflow/core:stream_executor", "//tensorflow/core/kernels:bounds_check", "//tensorflow/core/kernels:conv_2d_hdrs", "//tensorflow/core/kernels:conv_ops_gpu_hdrs", + "//tensorflow/core/kernels:cwise_lib_hdrs", "//tensorflow/core/kernels:eigen_contraction_kernel", "//tensorflow/core/kernels:gpu_util_hdrs", "//tensorflow/core/kernels:ops_util_hdrs", "//third_party/eigen3", + "@com_google_absl//absl/time", "@local_config_cuda//cuda:cudnn_header", ], alwayslink = 1, @@ -101,6 +103,7 @@ tf_custom_op_library( "//tensorflow/core/kernels:bounds_check_lib", "//tensorflow/core/kernels:conv_2d_hdrs", "//tensorflow/core/kernels:conv_ops_gpu_hdrs", + "//tensorflow/core/kernels:cwise_lib_hdrs", "//tensorflow/core/kernels:eigen_contraction_kernel", "//tensorflow/core/kernels:gpu_util_hdrs", "//tensorflow/core/kernels:ops_util_hdrs", diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc index 9dda04f3929..c097a2e103c 100644 --- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" #include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/cwise_ops.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -90,8 +91,9 @@ struct Int8x4ToInt32 { template class LaunchFusedConv2DBiasActivationOp { - using T = qint8; // conv_input and filter type - using TempT = qint32; // temporary accumulator type for tensor contraction + using T = qint8; // conv_input and filter type + using ComputeT = float; // convert inputs to fp32 for tensor contraction + using TempT = float; // temporary accumulator type for tensor contraction public: void launch(OpKernelContext* ctx, bool cudnn_use_autotune, @@ -106,7 +108,7 @@ class LaunchFusedConv2DBiasActivationOp { // Output tensor has type T (QInt8), but we can only evaluate Int8 Tensor // contraction using 32-bit accumulation (QInt32). 
- Tensor temp_output(DT_QINT32, output->shape()); + Tensor temp_output(DataTypeToEnum::value, output->shape()); constexpr int32 row_dilation = 1; constexpr int32 col_dilation = 1; @@ -132,7 +134,8 @@ class LaunchFusedConv2DBiasActivationOp { auto in0 = conv_input.shaped({conv_width, filter.dim_size(2)}); auto in1 = filter.shaped({filter.dim_size(2), filter.dim_size(3)}); - out.device(device) = in0.contract(in1, dim_pair, output_kernel); + out.device(device) = in0.cast().contract( + in1.cast(), dim_pair, output_kernel); } else if (filter.dim_size(0) == conv_input.dim_size(1) && filter.dim_size(1) == conv_input.dim_size(2) && @@ -151,7 +154,8 @@ class LaunchFusedConv2DBiasActivationOp { auto in0 = conv_input.shaped({conv_input.dim_size(0), k}); auto in1 = filter.shaped({k, filter.dim_size(3)}); - out.device(device) = in0.contract(in1, dim_pair, output_kernel); + out.device(device) = in0.cast().contract( + in1.cast(), dim_pair, output_kernel); } else { auto out = temp_output.tensor(); @@ -159,9 +163,9 @@ class LaunchFusedConv2DBiasActivationOp { auto in1 = filter.tensor(); // Need to swap row/col when calling Eigen. - out.device(device) = - Eigen::SpatialConvolution(in0, in1, col_stride, row_stride, padding, - col_dilation, row_dilation, output_kernel); + out.device(device) = Eigen::SpatialConvolution( + in0.cast(), in1.cast(), col_stride, row_stride, + padding, col_dilation, row_dilation, output_kernel); } } @@ -219,23 +223,31 @@ class LaunchFusedConv2DBiasActivationOp { typename TTypes::UnalignedTensor output(output_base + col * stride, num_rows); - auto conv_output_scaled = - conv_output.cast() * conv_input_scale; + // TODO(ezhulenev): No-op cast optimization in Eigen cause dangling + // references and segfaults. + static_assert(std::is_same::value, + "Must use 'conv_output.cast()'"); + auto conv_output_scaled = conv_output * conv_input_scale; + ScaleType lower_bound = (activation_mode == ActivationMode::NONE ? static_cast(kMinRange) : 0); if (side_input_scale == 0.0f) { - output = (conv_output_scaled + bias) - .round() - .clip(lower_bound, static_cast(kMaxRange)) - .template cast(); + output = + (conv_output_scaled + bias) + // scalar_round_op_google uses HALF_TO_EVEN. + .unaryExpr(Eigen::internal::scalar_round_op_google()) + .clip(lower_bound, static_cast(kMaxRange)) + .template cast(); } else { auto side_input_scaled = side_input.cast() * side_input_scale; - output = (conv_output_scaled + bias + side_input_scaled) - .round() - .clip(lower_bound, static_cast(kMaxRange)) - .template cast(); + output = + (conv_output_scaled + bias + side_input_scaled) + // scalar_round_op_google uses HALF_TO_EVEN. 
+ .unaryExpr(Eigen::internal::scalar_round_op_google()) + .clip(lower_bound, static_cast(kMaxRange)) + .template cast(); } } } diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py index 04edc7593a2..640a6b00965 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test_base.py @@ -966,6 +966,37 @@ class FusedConvInt8CPUTests(object): for test_param in self._test_params: self.runTest(test_param, apply_relu) + def testRoundingMode(self): + """Verify the fused convolution op uses half-to-even rounding mode.""" + batches = 1 + input_size = 2 + input_channels = 1 + output_channels = 1 + conv_input = np.array([1, 2, 3, 4]).reshape( + (batches, input_size, input_size, input_channels)).astype(np.int8) + kernel = np.array([1]).reshape( + (1, 1, input_channels, output_channels)).astype(np.int8) + biases = np.zeros((output_channels)).astype(np.float32) + + with self.session() as sess, self.test_scope(): + actual = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation( + math_ops.cast(conv_input, dtypes.qint8), + math_ops.cast(kernel, dtypes.qint8), + biases, + strides=[1, 1, 1, 1], + padding="SAME", + conv_input_scale=0.5, + side_input_scale=0.0, + activation_mode="None", + data_format="NHWC", + filter_format="HWIO") + actual_value = sess.run(actual) + # The convolution output scaled is [0.5, 1.0, 1.5, 2.0]. After rounding + # half to even, the final output is [0, 1, 2, 2]. + self.assertTrue( + np.array_equal(actual_value.flatten(), + np.array([0, 1, 2, 2]).astype(np.int8))) + # Test that GPU and CPU kernels produce identical results for QInt8 data type. class FusedConvInt8CorrespondenceTests(object): diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD index 3165e007996..ddd04947e9b 100644 --- a/tensorflow/contrib/gan/BUILD +++ b/tensorflow/contrib/gan/BUILD @@ -2,11 +2,12 @@ load("//tensorflow:tensorflow.bzl", "py_test") -package(default_visibility = [ - "//tensorflow:__subpackages__", -]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//tensorflow:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD index bf8b66dcfa5..797d0cdad3e 100644 --- a/tensorflow/contrib/gdr/BUILD +++ b/tensorflow/contrib/gdr/BUILD @@ -1,11 +1,12 @@ # Description: # GPU Direct RDMA Out-of-Band Tensor transport for TensorFlow. -package(default_visibility = [ - "//tensorflow:__subpackages__", -]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//tensorflow:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/graph_editor/BUILD b/tensorflow/contrib/graph_editor/BUILD index 35b6e638763..180cf69b07f 100644 --- a/tensorflow/contrib/graph_editor/BUILD +++ b/tensorflow/contrib/graph_editor/BUILD @@ -1,12 +1,13 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. 
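The new testRoundingMode case pins down the behavior introduced above: the CPU kernel now accumulates in float, scales by conv_input_scale, adds the bias, rounds half to even (via scalar_round_op_google), clips to the int8 range, and casts back to qint8. A minimal numpy sketch of that post-processing path, using the same values as the test (numpy's round is also half-to-even), reproduces the expected [0, 1, 2, 2]; this is only an illustration of the arithmetic, not the kernel code itself.

import numpy as np

# Values mirror testRoundingMode: a 1x1 convolution with a unit kernel and
# zero bias reduces to an elementwise copy, so only scaling and rounding matter.
conv_input = np.array([1, 2, 3, 4], dtype=np.int8)
conv_input_scale = np.float32(0.5)
bias = np.float32(0.0)

# Accumulate in float32 (the kernel now casts its qint8 operands to float
# before the contraction), then scale, add bias, round half to even,
# clip to the qint8 range and cast back to int8.
acc = conv_input.astype(np.float32)            # contraction output for a unit kernel
out = np.round(acc * conv_input_scale + bias)  # np.round rounds half to even
out = np.clip(out, -128, 127).astype(np.int8)
print(out)  # [0 1 2 2]

With round-half-away-from-zero the first element would be 1 instead of 0, which is exactly the regression this test guards against.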
-licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/grid_rnn/BUILD b/tensorflow/contrib/grid_rnn/BUILD index d0b44640667..126078ae791 100644 --- a/tensorflow/contrib/grid_rnn/BUILD +++ b/tensorflow/contrib/grid_rnn/BUILD @@ -2,12 +2,13 @@ # Contains classes to construct GridRNN cells # APIs here are meant to evolve over time. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "cuda_py_tests") py_library( diff --git a/tensorflow/contrib/hadoop/BUILD b/tensorflow/contrib/hadoop/BUILD index 178a8a6f084..87db7ea3b71 100644 --- a/tensorflow/contrib/hadoop/BUILD +++ b/tensorflow/contrib/hadoop/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//tensorflow:internal"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/hooks/BUILD b/tensorflow/contrib/hooks/BUILD index d65b2d6026d..78fd9aaab82 100644 --- a/tensorflow/contrib/hooks/BUILD +++ b/tensorflow/contrib/hooks/BUILD @@ -2,12 +2,13 @@ # Contains `SessionRunHook`s for use with `MonitoredSession` and the # wrappers around it. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD b/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD index e39c60b252a..dc0eea3f2e5 100644 --- a/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD +++ b/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD @@ -1,18 +1,19 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -licenses(["notice"]) # Apache 2.0 - load( "//tensorflow:tensorflow.bzl", - "tf_copts", "tf_cc_binary", + "tf_copts", +) + +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) exports_files(["LICENSE"]) -package(default_visibility = ["//visibility:public"]) - tf_cc_binary( name = "clock_cycle_profiling", testonly = 1, diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD index 92016e6a839..76ee1cc3b39 100644 --- a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD +++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD @@ -2,9 +2,10 @@ # Contains a tool to dump TensorFlow ops which are not supported # in TensorFlow HVX runtime. 
-package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD index c9d917fe20d..dfc1746f533 100755 --- a/tensorflow/contrib/image/BUILD +++ b/tensorflow/contrib/image/BUILD @@ -1,12 +1,13 @@ # Description: # Contains ops for image manipulation. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//visibility:public"]) - load( "//tensorflow:tensorflow.bzl", "tf_cc_test", diff --git a/tensorflow/contrib/input_pipeline/BUILD b/tensorflow/contrib/input_pipeline/BUILD index cf786c062ea..777399184e8 100644 --- a/tensorflow/contrib/input_pipeline/BUILD +++ b/tensorflow/contrib/input_pipeline/BUILD @@ -17,9 +17,10 @@ load( ) load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/input_pipeline/kernels/BUILD b/tensorflow/contrib/input_pipeline/kernels/BUILD index 797605b8fe6..64b614651a1 100644 --- a/tensorflow/contrib/input_pipeline/kernels/BUILD +++ b/tensorflow/contrib/input_pipeline/kernels/BUILD @@ -1,12 +1,13 @@ # Description: # Contains kernels for the input pipeline. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - cc_library( name = "input_pipeline_kernels", srcs = ["input_pipeline_kernels.cc"], diff --git a/tensorflow/contrib/integrate/BUILD b/tensorflow/contrib/integrate/BUILD index 9a2c94446fd..3cb268affac 100644 --- a/tensorflow/contrib/integrate/BUILD +++ b/tensorflow/contrib/integrate/BUILD @@ -1,12 +1,13 @@ # Description: # Integration and ODE solvers for TensorFlow. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD index 7a4cab20d1a..a839693340e 100644 --- a/tensorflow/contrib/keras/BUILD +++ b/tensorflow/contrib/keras/BUILD @@ -2,12 +2,13 @@ # Contains the Keras API (internal TensorFlow version). # Note that tf.contrib.keras has been deprecated in favor of tf.keras. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - py_library( name = "keras", srcs = [ diff --git a/tensorflow/contrib/kernel_methods/BUILD b/tensorflow/contrib/kernel_methods/BUILD index 833771eda0f..71c7bf99804 100644 --- a/tensorflow/contrib/kernel_methods/BUILD +++ b/tensorflow/contrib/kernel_methods/BUILD @@ -1,12 +1,13 @@ # Description: # Contains kernel methods for TensorFlow. 
-licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc b/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc index 95c7001371a..f24d091c3f8 100644 --- a/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc +++ b/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc @@ -42,7 +42,7 @@ Aws::Client::ClientConfiguration* InitializeDefaultClientConfig() { // is set with a truthy value. const char* load_config_env = getenv("AWS_SDK_LOAD_CONFIG"); string load_config = - load_config_env ? str_util::Lowercase(load_config_env) : ""; + load_config_env ? absl::AsciiStrToLower(load_config_env) : ""; if (load_config == "true" || load_config == "1") { Aws::String config_file; // If AWS_CONFIG_FILE is set then use it, otherwise use ~/.aws/config. diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD index fb28d6689a6..da5d8f6b4e2 100644 --- a/tensorflow/contrib/labeled_tensor/BUILD +++ b/tensorflow/contrib/labeled_tensor/BUILD @@ -1,12 +1,13 @@ # Description: # Labels for TensorFlow. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD index c6f6e722a4f..46040c64d43 100644 --- a/tensorflow/contrib/layers/BUILD +++ b/tensorflow/contrib/layers/BUILD @@ -1,13 +1,14 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -package(default_visibility = [ - "//learning/brain:__subpackages__", - "//tensorflow:__subpackages__", - "//tensorflow_model_optimization:__subpackages__", -]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//learning/brain:__subpackages__", + "//tensorflow:__subpackages__", + "//tensorflow_model_optimization:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/layers/kernels/BUILD b/tensorflow/contrib/layers/kernels/BUILD index 7aae09ff3e9..187a3a92d73 100644 --- a/tensorflow/contrib/layers/kernels/BUILD +++ b/tensorflow/contrib/layers/kernels/BUILD @@ -1,12 +1,13 @@ # Description: # Contains kernels for layers. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - cc_library( name = "sparse_feature_cross_kernel", srcs = ["sparse_feature_cross_kernel.cc"], diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc index 01893d60615..ee4b0373ef7 100644 --- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc +++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc @@ -182,7 +182,7 @@ class StringCrosser { } // TODO(zakaria): this will copy the string twice, might effect // performance. 
- return str_util::Join(cross_vec, k_feature_separator); + return absl::StrJoin(cross_vec, k_feature_separator); } private: diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index 7507e1fffa6..bb3d73f7a17 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -2922,7 +2922,7 @@ def spatial_softmax(features, First computes the softmax over the spatial extent of each channel of a convolutional feature map. Then computes the expected 2D position of the points of maximal activation for each channel, resulting in a set of - feature keypoints [x1, y1, ... xN, yN] for all N channels. + feature keypoints [i1, j1, ... iN, jN] for all N channels. Read more here: "Learning visual feature spaces for robotic manipulation with @@ -2943,7 +2943,7 @@ def spatial_softmax(features, feature_keypoints: A `Tensor` with size [batch_size, num_channels * 2]; the expected 2D locations of each channel's feature keypoint (normalized to the range (-1,1)). The inner dimension is arranged as - [x1, y1, ... xN, yN]. + [i1, j1, ... iN, jN]. Raises: ValueError: If unexpected data_format specified. ValueError: If num_channels dimension is unspecified. diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index 1d0cac308f3..0a34e91e33f 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -1,17 +1,17 @@ # Description: # Contains TF Learn (aka Scikit Flow) sub-project with high level tensorflow API. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//engedu/ml/tf_from_scratch:__pkg__", + "//tensorflow:internal", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "py_test") - -package(default_visibility = [ - "//engedu/ml/tf_from_scratch:__pkg__", - "//tensorflow:internal", -]) - load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test") py_library( @@ -112,6 +112,7 @@ py_test( name = "data_feeder_test", size = "small", srcs = ["python/learn/learn_io/data_feeder_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -127,6 +128,7 @@ py_test( name = "estimators_test", size = "small", srcs = ["python/learn/estimators/estimators_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -145,6 +147,7 @@ py_test( name = "metric_spec_test", size = "small", srcs = ["python/learn/metric_spec_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -174,6 +177,7 @@ py_test( name = "export_strategy_test", size = "small", srcs = ["python/learn/export_strategy_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -185,6 +189,7 @@ py_test( name = "graph_actions_test", size = "small", srcs = ["python/learn/graph_actions_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_windows"], # TODO: needs investigation on Windows deps = [ @@ -208,6 +213,7 @@ py_test( name = "learn_runner_test", size = "small", srcs = ["python/learn/learn_runner_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -222,6 +228,7 @@ py_test( name = "monitors_test", size = "small", srcs = ["python/learn/monitors_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip_gpu"], # b/74437598 deps = [ @@ -247,6 +254,7 @@ py_test( name = "run_config_test", size = "small", srcs = 
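The spatial_softmax docstring fix above is purely notational: the returned keypoints are (row, column) pairs, i.e. [i1, j1, ..., iN, jN], not (x, y) pairs. As an illustration of that output layout (not the library implementation), a small numpy sketch of the expected-position computation with arbitrary inputs could look like this:

import numpy as np

def spatial_softmax_points(features):
  """features: [batch, height, width, channels] -> [batch, channels * 2].

  Output is arranged as [i1, j1, ..., iN, jN]; each coordinate is the
  softmax-weighted expected position of a channel, normalized to (-1, 1).
  """
  b, h, w, c = features.shape
  pos_i, pos_j = np.meshgrid(np.linspace(-1.0, 1.0, h),
                             np.linspace(-1.0, 1.0, w), indexing='ij')
  logits = features.reshape(b, h * w, c)
  weights = np.exp(logits - logits.max(axis=1, keepdims=True))
  weights /= weights.sum(axis=1, keepdims=True)                # softmax over h * w
  exp_i = (weights * pos_i.reshape(1, h * w, 1)).sum(axis=1)   # [b, c]
  exp_j = (weights * pos_j.reshape(1, h * w, 1)).sum(axis=1)   # [b, c]
  return np.stack([exp_i, exp_j], axis=-1).reshape(b, c * 2)   # interleaved i, j

print(spatial_softmax_points(np.random.randn(2, 5, 5, 3)).shape)  # (2, 6)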
["python/learn/estimators/run_config_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -260,6 +268,7 @@ py_test( py_test( name = "tensor_signature_test", srcs = ["python/learn/estimators/tensor_signature_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "manual", # b/130760310 @@ -277,6 +286,7 @@ py_test( name = "estimator_test", size = "medium", srcs = ["python/learn/estimators/estimator_test.py"], + python_version = "PY2", shard_count = 2, srcs_version = "PY2AND3", tags = [ @@ -321,6 +331,7 @@ py_test( name = "estimator_input_test", size = "medium", srcs = ["python/learn/estimators/estimator_input_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -342,6 +353,7 @@ py_test( name = "logistic_regressor_test", size = "small", srcs = ["python/learn/estimators/logistic_regressor_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -362,6 +374,7 @@ py_test( name = "dnn_linear_combined_test", size = "medium", srcs = ["python/learn/estimators/dnn_linear_combined_test.py"], + python_version = "PY2", shard_count = 8, srcs_version = "PY2AND3", tags = ["no_oss"], # flaky b/70524820 @@ -387,6 +400,7 @@ py_test( name = "head_test", size = "medium", srcs = ["python/learn/estimators/head_test.py"], + python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", tags = ["noasan"], # times out b/63678675 @@ -417,6 +431,7 @@ py_test( name = "dnn_test", size = "medium", srcs = ["python/learn/estimators/dnn_test.py"], + python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", tags = ["notap"], @@ -441,6 +456,7 @@ py_test( name = "kmeans_test", size = "medium", srcs = ["python/learn/estimators/kmeans_test.py"], + python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", tags = [ @@ -467,6 +483,7 @@ py_test( name = "dynamic_rnn_estimator_test", size = "medium", srcs = ["python/learn/estimators/dynamic_rnn_estimator_test.py"], + python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", deps = [ @@ -493,6 +510,7 @@ py_test( name = "state_saving_rnn_estimator_test", size = "medium", srcs = ["python/learn/estimators/state_saving_rnn_estimator_test.py"], + python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", tags = ["noasan"], @@ -517,6 +535,7 @@ py_test( name = "linear_test", size = "medium", srcs = ["python/learn/estimators/linear_test.py"], + python_version = "PY2", shard_count = 20, srcs_version = "PY2AND3", tags = ["no_pip"], @@ -541,6 +560,7 @@ py_test( name = "debug_test", size = "medium", srcs = ["python/learn/estimators/debug_test.py"], + python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", deps = [ @@ -563,6 +583,7 @@ py_test( name = "composable_model_test", size = "medium", srcs = ["python/learn/estimators/composable_model_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -580,6 +601,7 @@ py_test( name = "svm_test", size = "medium", srcs = ["python/learn/estimators/svm_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -594,6 +616,7 @@ py_test( name = "grid_search_test", size = "small", srcs = ["python/learn/grid_search_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -605,6 +628,7 @@ py_test( name = "io_test", size = "small", srcs = ["python/learn/learn_io/io_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -618,6 +642,7 @@ py_test( name = "model_fn_test", size = "small", srcs = 
["python/learn/estimators/model_fn_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -636,7 +661,11 @@ py_test( name = "multioutput_test", size = "small", srcs = ["python/learn/estimators/multioutput_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", + tags = [ + "no_oss", + ], deps = [ ":learn", "//tensorflow/python:client_testlib", @@ -648,6 +677,7 @@ py_test( name = "nonlinear_test", size = "medium", srcs = ["python/learn/estimators/nonlinear_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -662,6 +692,7 @@ py_test( name = "regression_test", size = "small", srcs = ["python/learn/estimators/regression_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -674,6 +705,7 @@ py_test( name = "rnn_common_test", size = "medium", srcs = ["python/learn/estimators/rnn_common_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -688,6 +720,7 @@ py_test( name = "ops_test", size = "small", srcs = ["python/learn/ops/ops_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -705,6 +738,7 @@ py_test( name = "seq2seq_ops_test", size = "small", srcs = ["python/learn/ops/seq2seq_ops_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -720,6 +754,7 @@ py_test( name = "categorical_test", size = "small", srcs = ["python/learn/preprocessing/tests/categorical_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -732,6 +767,7 @@ py_test( name = "categorical_vocabulary_test", size = "small", srcs = ["python/learn/preprocessing/tests/categorical_vocabulary_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -743,6 +779,7 @@ py_test( name = "text_test", size = "small", srcs = ["python/learn/preprocessing/tests/text_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -777,6 +814,7 @@ py_test( name = "pandas_io_test", size = "small", srcs = ["python/learn/learn_io/pandas_io_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -792,6 +830,7 @@ py_test( size = "small", timeout = "moderate", srcs = ["python/learn/utils/export_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "manual", # http://b/31032996 @@ -819,6 +858,7 @@ py_test( name = "gc_test", size = "small", srcs = ["python/learn/utils/gc_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -834,6 +874,7 @@ py_test( name = "saved_model_export_utils_test", size = "small", srcs = ["python/learn/utils/saved_model_export_utils_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_windows"], # TODO: needs investigation on Windows deps = [ @@ -854,6 +895,7 @@ py_test( name = "input_fn_utils_test", size = "small", srcs = ["python/learn/utils/input_fn_utils_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -867,6 +909,7 @@ py_test( name = "stability_test", size = "small", srcs = ["python/learn/estimators/stability_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":learn", @@ -884,6 +927,7 @@ py_test( py_binary( name = "inspect_checkpoint", srcs = ["python/learn/utils/inspect_checkpoint.py"], + python_version = "PY2", srcs_version = "PY2AND3", deps = [ "//tensorflow/contrib/framework:framework_py", diff --git a/tensorflow/contrib/learn/python/learn/datasets/BUILD 
b/tensorflow/contrib/learn/python/learn/datasets/BUILD index d6a43ee3a69..c872c55c6b8 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/BUILD +++ b/tensorflow/contrib/learn/python/learn/datasets/BUILD @@ -1,8 +1,9 @@ # Prepare training and testing data. -package(default_visibility = ["//tensorflow:internal"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD index 4ce91a140f8..8974f85a209 100644 --- a/tensorflow/contrib/legacy_seq2seq/BUILD +++ b/tensorflow/contrib/legacy_seq2seq/BUILD @@ -2,12 +2,13 @@ # Contains library to create sequence-to-sequence models on top of TensorFlow. # APIs here are meant to evolve over time. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//visibility:public"]) - load("//tensorflow:tensorflow.bzl", "cuda_py_tests") py_library( diff --git a/tensorflow/contrib/libsvm/BUILD b/tensorflow/contrib/libsvm/BUILD index 4dccb9be7cd..7d83dc5818a 100644 --- a/tensorflow/contrib/libsvm/BUILD +++ b/tensorflow/contrib/libsvm/BUILD @@ -1,9 +1,8 @@ package( default_visibility = ["//visibility:private"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD index ec0cbf92dd2..db81ed7057d 100644 --- a/tensorflow/contrib/linear_optimizer/BUILD +++ b/tensorflow/contrib/linear_optimizer/BUILD @@ -2,12 +2,13 @@ # Contains ops to train linear models on top of TensorFlow. # APIs here are meant to evolve over time. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD index 83e80f25bcf..c4053ba9679 100644 --- a/tensorflow/contrib/lookup/BUILD +++ b/tensorflow/contrib/lookup/BUILD @@ -1,12 +1,13 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:internal"]) - load("//tensorflow:tensorflow.bzl", "tf_py_test") # TODO(yleon): Refactor after one we switching to the V2 kernels. diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD index c51b651d1a4..4861bdab15b 100644 --- a/tensorflow/contrib/losses/BUILD +++ b/tensorflow/contrib/losses/BUILD @@ -1,12 +1,13 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. 
-licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/makefile/BUILD b/tensorflow/contrib/makefile/BUILD index 1abb46f4d41..afd6a785705 100644 --- a/tensorflow/contrib/makefile/BUILD +++ b/tensorflow/contrib/makefile/BUILD @@ -1,5 +1,6 @@ # Necessary build rules for makefile build in our CI. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//visibility:private"]) +package( + default_visibility = ["//visibility:private"], + licenses = ["notice"], # Apache 2.0 +) diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index 13f84313314..ba0ea348ef8 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -109,7 +109,7 @@ $(HOST_NSYNC_LIB) \ # If we're on Linux, also link in the dl library. ifeq ($(HOST_OS),LINUX) - HOST_LIBS += -ldl -lpthread + HOST_LIBS += -ldl -lpthread -lrt endif # If we're on a Pi, link in pthreads and dl @@ -259,7 +259,7 @@ endif endif # If we're on Linux, also link in the dl library. ifeq ($(TARGET),LINUX) - LIBS += -ldl -lpthread + LIBS += -ldl -lpthread -lrt endif # If we're cross-compiling for the Raspberry Pi, use the right gcc. ifeq ($(TARGET),PI) @@ -636,6 +636,8 @@ CORE_CC_ALL_SRCS := \ $(ABSL_CC_SRCS) \ tensorflow/c/c_api.cc \ tensorflow/c/kernels.cc \ +tensorflow/c/tf_datatype.cc \ +tensorflow/c/tf_status.cc \ tensorflow/c/tf_status_helper.cc \ $(wildcard tensorflow/core/*.cc) \ $(wildcard tensorflow/core/common_runtime/*.cc) \ diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 7566733680c..c41513a9096 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -26,7 +26,7 @@ if [ ! -f $BZL_FILE_PATH ]; then exit 1; fi -EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" +EIGEN_URL="$(grep -o 'https://bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" GEMMLOWP_URL="$(grep -o 'http://mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" NSYNC_URL="$(grep -o 'http://mirror.tensorflow.org/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/contrib/memory_stats/BUILD b/tensorflow/contrib/memory_stats/BUILD index 93701249cc8..0a35cb78704 100644 --- a/tensorflow/contrib/memory_stats/BUILD +++ b/tensorflow/contrib/memory_stats/BUILD @@ -1,12 +1,13 @@ # Description: # Ops that get statistics on memory allocators. 
-licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") diff --git a/tensorflow/contrib/meta_graph_transform/BUILD b/tensorflow/contrib/meta_graph_transform/BUILD index d667b8e1449..a4228beb6e6 100644 --- a/tensorflow/contrib/meta_graph_transform/BUILD +++ b/tensorflow/contrib/meta_graph_transform/BUILD @@ -1,9 +1,10 @@ # Description: # Utility for applying the Graph Transform tool to a MetaGraphDef. -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD index 858fd1ede45..9615f65ab1d 100644 --- a/tensorflow/contrib/metrics/BUILD +++ b/tensorflow/contrib/metrics/BUILD @@ -2,15 +2,16 @@ # Contains ops for evaluation metrics and summary statistics. # APIs here are meant to evolve over time. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//engedu/ml/tf_from_scratch:__pkg__", + "//tensorflow:internal", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = [ - "//engedu/ml/tf_from_scratch:__pkg__", - "//tensorflow:internal", -]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/mixed_precision/BUILD b/tensorflow/contrib/mixed_precision/BUILD index 3dfb95e0a00..5b41eed73f3 100644 --- a/tensorflow/contrib/mixed_precision/BUILD +++ b/tensorflow/contrib/mixed_precision/BUILD @@ -2,10 +2,9 @@ package( default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) filegroup( diff --git a/tensorflow/contrib/mixed_precision/python/BUILD b/tensorflow/contrib/mixed_precision/python/BUILD index 39821399fc9..de1ac08bfe8 100644 --- a/tensorflow/contrib/mixed_precision/python/BUILD +++ b/tensorflow/contrib/mixed_precision/python/BUILD @@ -2,10 +2,9 @@ package( default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "py_test") diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD index ce77143e0c3..00a625ff2b8 100644 --- a/tensorflow/contrib/model_pruning/BUILD +++ b/tensorflow/contrib/model_pruning/BUILD @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -package(default_visibility = ["//tensorflow:__subpackages__"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "py_test") diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md index 98760ea7050..01a58fdcdea 100644 --- a/tensorflow/contrib/model_pruning/README.md +++ b/tensorflow/contrib/model_pruning/README.md @@ -50,7 +50,7 @@ The pruning library allows for specification of the following hyper parameters: | name | string | model_pruning | Name of the pruning specification. Used for adding summaries and ops under a common tensorflow name_scope | | begin_pruning_step | integer | 0 | The global step at which to begin pruning | | end_pruning_step | integer | -1 | The global step at which to terminate pruning. Defaults to -1 implying that pruning continues till the training stops | -| weight_sparsity_map | list of strings | [""] | list of weight variable name (or layer name):target sparsity pairs. Eg. [conv1:0.9,conv2/kernel:0.8]. For layers/weights not in this list, sparsity as specified by the target_sparsity hyperparameter is used. | +| weight_sparsity_map | list of strings | [""] | list of weight variable name regex (or layer name regex):target sparsity pairs. Eg. [conv1:0.9,conv.*/kernel:0.8]. For layers/weights not in this list, sparsity as specified by the target_sparsity hyperparameter is used. | | threshold_decay | float | 0.0 | The decay factor to use for exponential decay of the thresholds | | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) | | block_height|integer | 1 | Number of rows in a block for block sparse matrices| diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD index 805a6eab236..d75211086e6 100644 --- a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD +++ b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD @@ -19,10 +19,9 @@ package( default_visibility = [ "//tensorflow:internal", ], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - py_library( name = "cifar10_input", srcs = ["cifar10_input.py"], diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py index 9966f7cf798..85fcbad26c7 100644 --- a/tensorflow/contrib/model_pruning/python/pruning.py +++ b/tensorflow/contrib/model_pruning/python/pruning.py @@ -60,6 +60,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import re + from tensorflow.contrib.model_pruning.python import pruning_utils from tensorflow.contrib.model_pruning.python.layers import core_layers as core from tensorflow.contrib.training.python.training import hparam @@ -153,7 +155,7 @@ def get_pruning_hparams(): the global step at which to terminate pruning. Defaults to -1 implying that pruning continues till the training stops weight_sparsity_map: list of strings - comma separed list of weight variable name:target sparsity pairs. + comma separed list of weight variable name regex:target sparsity pairs. For layers/weights not in this list, sparsity as specified by the target_sparsity hyperparameter is used. Eg. 
[conv1:0.9,conv2/kernel:0.8] @@ -355,8 +357,8 @@ class Pruning(object): def _get_sparsity(self, weight_name): """Return target sparsity for the given layer/weight name.""" target_sparsity = [ - sparsity for name, sparsity in self._weight_sparsity_map.items() - if weight_name.find(name) != -1 + sparsity for regexp, sparsity in self._weight_sparsity_map.items() + if re.match(regexp, weight_name) ] if not target_sparsity: return self._sparsity diff --git a/tensorflow/contrib/nearest_neighbor/BUILD b/tensorflow/contrib/nearest_neighbor/BUILD index 6fa76244670..4d74da7962d 100644 --- a/tensorflow/contrib/nearest_neighbor/BUILD +++ b/tensorflow/contrib/nearest_neighbor/BUILD @@ -1,9 +1,10 @@ # Description: # Tensorflow ops for nearest neighbor queries etc. -package(default_visibility = ["//tensorflow:__subpackages__"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/nn/BUILD b/tensorflow/contrib/nn/BUILD index e3e36c4fdf5..5fe1396bced 100644 --- a/tensorflow/contrib/nn/BUILD +++ b/tensorflow/contrib/nn/BUILD @@ -1,12 +1,13 @@ # Description: # Contains deprecated ops to calculate cross entropy. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//visibility:public"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index 6c85533d774..63eb73940c4 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -1,12 +1,13 @@ # Description: # Optimization routines. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") load("//tensorflow:tensorflow.bzl", "tf_py_test") diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD index 6e401406308..8ecc0b09f4d 100644 --- a/tensorflow/contrib/optimizer_v2/BUILD +++ b/tensorflow/contrib/optimizer_v2/BUILD @@ -2,10 +2,9 @@ package( default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "py_test") diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD index 37674071e41..9d97b85d851 100644 --- a/tensorflow/contrib/periodic_resample/BUILD +++ b/tensorflow/contrib/periodic_resample/BUILD @@ -8,9 +8,10 @@ load( ) load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/pi_examples/label_image/Makefile b/tensorflow/contrib/pi_examples/label_image/Makefile index 9d054a3133a..58fbd18dc3a 100644 --- a/tensorflow/contrib/pi_examples/label_image/Makefile +++ b/tensorflow/contrib/pi_examples/label_image/Makefile @@ -34,12 +34,14 @@ CXXFLAGS := --std=c++11 $(OPTFLAGS) LDFLAGS := \ -L/usr/local/lib \ -L$(TFLIBDIR) \ +-L$(DOWNLOADSDIR)/nsync/builds/default.linux.c++11/ \ -Wl,--no-whole-archive 
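With this change the keys of weight_sparsity_map are treated as regular expressions anchored at the start of the weight name (re.match) rather than as plain substrings, as the updated README table describes. A minimal sketch of the lookup logic, with hypothetical map entries and a stand-in for the target_sparsity fallback:

import re

# Hypothetical entries; 'conv.*/kernel' now covers conv2/kernel, conv3/kernel, ...
weight_sparsity_map = {'conv1': 0.9, 'conv.*/kernel': 0.8}
target_sparsity = 0.5  # stand-in for the target_sparsity hyperparameter

def get_sparsity(weight_name):
  matches = [sparsity for regexp, sparsity in weight_sparsity_map.items()
             if re.match(regexp, weight_name)]
  return matches[0] if matches else target_sparsity

print(get_sparsity('conv1/weights'))  # 0.9 (prefix match on 'conv1')
print(get_sparsity('conv2/kernel'))   # 0.8; the old substring lookup found no entry here
print(get_sparsity('fc1/weights'))    # 0.5, falls back to target_sparsity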
INCLUDES := \ -I/usr/local/include \ -I. \ -I$(DOWNLOADSDIR) \ -I$(DOWNLOADSDIR)/eigen/ \ +-I$(DOWNLOADSDIR)/absl/ \ -I$(PROTOGENDIR) \ -I$(PBTGENDIR) LIBS := \ @@ -49,6 +51,7 @@ LIBS := \ -Wl,--no-whole-archive \ -lstdc++ \ -lprotobuf \ +-lnsync \ -ldl \ -lpthread \ -lm \ diff --git a/tensorflow/contrib/pi_examples/label_image/label_image.cc b/tensorflow/contrib/pi_examples/label_image/label_image.cc index c6935a093f7..97a6e69ac03 100644 --- a/tensorflow/contrib/pi_examples/label_image/label_image.cc +++ b/tensorflow/contrib/pi_examples/label_image/label_image.cc @@ -26,6 +26,7 @@ limitations under the License. #include #include #include + #include #include diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD index 3189bb97ca3..279006843ef 100644 --- a/tensorflow/contrib/predictor/BUILD +++ b/tensorflow/contrib/predictor/BUILD @@ -1,8 +1,9 @@ # `Predictor` classes provide an interface for efficient, repeated inference. -package(default_visibility = ["//tensorflow/contrib/predictor:__subpackages__"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow/contrib/predictor:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD index c167fd70189..403e9bc67c2 100644 --- a/tensorflow/contrib/proto/BUILD +++ b/tensorflow/contrib/proto/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/proto/python/ops/BUILD b/tensorflow/contrib/proto/python/ops/BUILD index ac09934b77d..cc5d319be27 100644 --- a/tensorflow/contrib/proto/python/ops/BUILD +++ b/tensorflow/contrib/proto/python/ops/BUILD @@ -1,6 +1,7 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:__subpackages__"]) +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) # Placeholders for folks with old dependencies. py_library( diff --git a/tensorflow/contrib/quantization/BUILD b/tensorflow/contrib/quantization/BUILD index 2de10e8faef..80f0a10ec75 100644 --- a/tensorflow/contrib/quantization/BUILD +++ b/tensorflow/contrib/quantization/BUILD @@ -1,16 +1,17 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. 
-licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//visibility:public"]) - load( "//tensorflow:tensorflow.bzl", - "tf_gen_op_wrapper_py", "tf_custom_op_library", + "tf_gen_op_wrapper_py", # @unused ) py_library( diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD index 598f6d15676..8183fab5f32 100644 --- a/tensorflow/contrib/quantize/BUILD +++ b/tensorflow/contrib/quantize/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py index a70f748fad6..6b94de61a60 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py @@ -185,7 +185,7 @@ def _FindFusedBatchNorms(graph): graph_matcher.OpTypePattern('*')]) batch_norm_pattern = graph_matcher.OpTypePattern( - 'FusedBatchNorm', + 'FusedBatchNorm|FusedBatchNormV3', inputs=[ graph_matcher.OneofPattern( [matmul_reshape_pattern, layer_output_pattern]), gamma_pattern, @@ -489,8 +489,14 @@ def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor, @ops.RegisterGradient('FoldFusedBatchNormGrad') -def _FoldFusedBatchNormGrad(op, unused_grad_y, grad_mean, grad_var, unused_1, - unused_2): +def _FoldFusedBatchNormGrad(op, + unused_grad_y, + grad_mean, + grad_var, + unused_1, + unused_2, + unused_3=None): + """Gradient function for the FusedBatchNorm ops matched by _GetLayerMatch.""" x = op.inputs[0] n = math_ops.cast( array_ops.size(x) / array_ops.size(grad_mean), dtypes.float32) diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py index 77b3f62e9d6..8616548bace 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py @@ -21,6 +21,7 @@ from __future__ import print_function from tensorflow.contrib.layers.python.layers import layers from tensorflow.contrib.quantize.python import fold_batch_norms from tensorflow.python.client import session +from tensorflow.python.compat import compat from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed @@ -167,7 +168,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) def testFoldConv2d(self): - self._RunTestOverParameters(self._TestFoldConv2d) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunTestOverParameters(self._TestFoldConv2d) def testMultipleLayerConv2d(self, relu=nn_ops.relu, @@ -337,7 +339,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) def testFoldConv2dUnknownShape(self): - self._RunTestOverParameters(self._TestFoldConv2dUnknownShape) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunTestOverParameters(self._TestFoldConv2dUnknownShape) def _TestFoldFullyConnectedLayer( self, relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm, @@ -432,7 +435,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): 
self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) def testFoldFullyConnectedLayer(self): - self._RunTestOverParameters(self._TestFoldFullyConnectedLayer) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunTestOverParameters(self._TestFoldFullyConnectedLayer) def _TestFoldDepthwiseConv2d(self, relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm, @@ -543,7 +547,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) def testFoldDepthwiseConv2d(self): - self._RunTestOverParameters(self._TestFoldDepthwiseConv2d) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunTestOverParameters(self._TestFoldDepthwiseConv2d) def _TestFoldAtrousConv2d(self, relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm, freeze_batch_norm_delay, @@ -660,7 +665,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) def testFoldAtrousConv2d(self): - self._RunTestOverParameters(self._TestFoldAtrousConv2d) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunTestOverParameters(self._TestFoldAtrousConv2d) def _TestCompareFoldAndUnfolded(self, relu, @@ -733,7 +739,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): self.assertAllClose(unfolded_backward, folded_backward, atol=1e-3) def testCompareFoldAndUnfolded(self): - self._RunTestOverParameters(self._TestCompareFoldAndUnfolded) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunTestOverParameters(self._TestCompareFoldAndUnfolded) def _BatchNormParams(self, scale=True, fused=False): return { diff --git a/tensorflow/contrib/quantize/python/graph_matcher_test.py b/tensorflow/contrib/quantize/python/graph_matcher_test.py index be741644b61..95849d75b61 100644 --- a/tensorflow/contrib/quantize/python/graph_matcher_test.py +++ b/tensorflow/contrib/quantize/python/graph_matcher_test.py @@ -22,6 +22,7 @@ from tensorflow.contrib.framework.python import ops as contrib_ops from tensorflow.contrib.layers.python.layers import initializers from tensorflow.contrib.layers.python.layers import layers from tensorflow.contrib.quantize.python import graph_matcher +from tensorflow.python.compat import compat from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -36,48 +37,51 @@ from tensorflow.python.platform import googletest class GraphMatcherTest(test_util.TensorFlowTestCase): def test_conv_layer(self): - g = ops.Graph() - with g.as_default(): - inputs = array_ops.placeholder(dtypes.float32, shape=[8, 5, 5, 3]) + with compat.forward_compatibility_horizon(2019, 6, 7): + g = ops.Graph() + with g.as_default(): + inputs = array_ops.placeholder(dtypes.float32, shape=[8, 5, 5, 3]) - with contrib_ops.arg_scope( - [layers.batch_norm], fused=True, is_training=True, trainable=True): - return layers.convolution( - inputs, - num_outputs=16, - kernel_size=3, - stride=1, - padding='VALID', - activation_fn=nn_ops.relu, - normalizer_fn=layers.batch_norm, - normalizer_params={}, - weights_initializer=initializers.xavier_initializer(), - weights_regularizer=None, - biases_initializer=init_ops.zeros_initializer(), - biases_regularizer=None, - reuse=None, - trainable=True, - scope=None) + with contrib_ops.arg_scope([layers.batch_norm], + fused=True, + is_training=True, + trainable=True): + return layers.convolution( + inputs, + num_outputs=16, + kernel_size=3, 
+ stride=1, + padding='VALID', + activation_fn=nn_ops.relu, + normalizer_fn=layers.batch_norm, + normalizer_params={}, + weights_initializer=initializers.xavier_initializer(), + weights_regularizer=None, + biases_initializer=init_ops.zeros_initializer(), + biases_regularizer=None, + reuse=None, + trainable=True, + scope=None) - inputs_pattern = graph_matcher.OpTypePattern('*', name='inputs') - relu_pattern = graph_matcher.OpTypePattern( - 'Relu', - name='relu', - inputs=[ - graph_matcher.OpTypePattern( - 'FusedBatchNorm', - inputs=[ - graph_matcher.OpTypePattern( - 'Conv2D', inputs=[inputs_pattern, '*']), '*', '*', '*', - '*' - ]) - ]) - matcher = graph_matcher.GraphMatcher(relu_pattern) - match_results = list(matcher.match_graph(g)) - self.assertEqual(1, len(match_results)) - match_result = match_results[0] - self.assertEqual(match_result.get_tensor(inputs_pattern), inputs) - self.assertEqual(match_result.get_tensor('inputs'), inputs) + inputs_pattern = graph_matcher.OpTypePattern('*', name='inputs') + relu_pattern = graph_matcher.OpTypePattern( + 'Relu', + name='relu', + inputs=[ + graph_matcher.OpTypePattern( + 'FusedBatchNormV3', + inputs=[ + graph_matcher.OpTypePattern( + 'Conv2D', inputs=[inputs_pattern, '*']), '*', '*', + '*', '*' + ]) + ]) + matcher = graph_matcher.GraphMatcher(relu_pattern) + match_results = list(matcher.match_graph(g)) + self.assertEqual(1, len(match_results)) + match_result = match_results[0] + self.assertEqual(match_result.get_tensor(inputs_pattern), inputs) + self.assertEqual(match_result.get_tensor('inputs'), inputs) def test_multiple_outputs(self): # - + diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py index 7c973fe5971..c2053beae33 100644 --- a/tensorflow/contrib/quantize/python/quantize.py +++ b/tensorflow/contrib/quantize/python/quantize.py @@ -577,7 +577,7 @@ def _IsSkipLayer(activation_op): if activation_op.type == 'Identity' and len(activation_op.outputs) == 1: if len(activation_op.outputs[0].consumers()) == 1: consumer = activation_op.outputs[0].consumers()[0] - if consumer.type == 'FusedBatchNorm': + if consumer.type in ['FusedBatchNorm', 'FusedBatchNormV3']: skip_layer = True logging.info( 'Skipping quantizing %s, because it is the output of a conv/fc ' diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py index 9aa6e2c24d4..054c66be9cd 100644 --- a/tensorflow/contrib/quantize/python/quantize_graph_test.py +++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py @@ -23,6 +23,7 @@ import functools from tensorflow.contrib.layers.python.layers import layers from tensorflow.contrib.quantize.python import quantize_graph from tensorflow.python import training +from tensorflow.python.compat import compat from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -275,7 +276,8 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase): self.assertEqual(graph_def_before, graph_def_after) def testIdentityNode(self): - self._RunTestOverAllRewrites(self._TestIdentityNode) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunTestOverAllRewrites(self._TestIdentityNode) def _TestIdentityNode(self, rewrite_fn): graph = ops.Graph() @@ -293,10 +295,11 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase): conv_out_identity = graph.get_operation_by_name('test/conv_out') self._AssertOutputGoesToOps(conv_out_identity, graph, - 
['test/BatchNorm/FusedBatchNorm']) + ['test/BatchNorm/FusedBatchNormV3']) def testActivationQuantization(self): - self._RunTestOverAllRewrites(self._TestActivationQuantization) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunTestOverAllRewrites(self._TestActivationQuantization) def _TestActivationQuantization(self, rewrite_fn): graph = ops.Graph() diff --git a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py index f6bf57a789c..26a6e35c3c6 100644 --- a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py +++ b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py @@ -21,6 +21,7 @@ from __future__ import print_function from tensorflow.contrib.layers.python.layers import layers from tensorflow.contrib.quantize.python import fold_batch_norms from tensorflow.contrib.quantize.python import quantize +from tensorflow.python.compat import compat from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -484,7 +485,9 @@ class QuantizeTest(test_util.TensorFlowTestCase): self._AssertIdempotent(graph) def testQuantize_Conv2dWithBatchNorm(self): - self._RunBatchNormTestOverParameters(self._TestQuantize_Conv2dWithBatchNorm) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunBatchNormTestOverParameters( + self._TestQuantize_Conv2dWithBatchNorm) def _TestQuantize_Conv2dWithBatchNorm(self, activation, activation_op_name, with_bypass, delay, fused_batch_norm, @@ -541,7 +544,8 @@ class QuantizeTest(test_util.TensorFlowTestCase): use_resource) def testQuantize_FCWithBatchNorm(self): - self._RunBatchNormTestOverParameters(self._TestQuantize_FCWithBatchNorm) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunBatchNormTestOverParameters(self._TestQuantize_FCWithBatchNorm) def _TestQuantize_FCWithBatchNorm(self, activation, activation_op_name, with_bypass, delay, fused_batch_norm, @@ -596,8 +600,9 @@ class QuantizeTest(test_util.TensorFlowTestCase): use_resource) def testQuantize_DepthwiseConv2dWithBatchNorm(self): - self._RunBatchNormTestOverParameters( - self._TestQuantize_DepthwiseConv2dWithBatchNorm) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunBatchNormTestOverParameters( + self._TestQuantize_DepthwiseConv2dWithBatchNorm) def _TestQuantize_DepthwiseConv2dWithBatchNorm( self, activation, activation_op_name, with_bypass, delay, @@ -654,8 +659,9 @@ class QuantizeTest(test_util.TensorFlowTestCase): with_bypass, delay, use_resource) def testQuantize_AtrousConvWithBatchNorm(self): - self._RunBatchNormTestOverParameters( - self._TestQuantize_AtrousConvWithBatchNorm) + with compat.forward_compatibility_horizon(2019, 6, 7): + self._RunBatchNormTestOverParameters( + self._TestQuantize_AtrousConvWithBatchNorm) def _TestQuantize_AtrousConvWithBatchNorm( self, activation, activation_op_name, with_bypass, delay, @@ -723,18 +729,19 @@ class QuantizeTest(test_util.TensorFlowTestCase): self.assertEqual(graph_def_before, graph_def_after) def testBatchNormForcedUpdates(self): - parameter_list = [ - # (activation, activation_op_name, fused_batch_norm) - (nn_ops.relu6, 'Relu6', False), - (nn_ops.relu, 'Relu', False), - (array_ops.identity, 'Identity', False), - (nn_ops.relu6, 'Relu6', True), - (nn_ops.relu, 'Relu', True), - (array_ops.identity, 'Identity', True), - ] - for params in parameter_list: - self._TestBatchNormForcedUpdates(params[0], params[1], params[2], 
False) - self._TestBatchNormForcedUpdates(params[0], params[1], params[2], True) + with compat.forward_compatibility_horizon(2019, 6, 7): + parameter_list = [ + # (activation, activation_op_name, fused_batch_norm) + (nn_ops.relu6, 'Relu6', False), + (nn_ops.relu, 'Relu', False), + (array_ops.identity, 'Identity', False), + (nn_ops.relu6, 'Relu6', True), + (nn_ops.relu, 'Relu', True), + (array_ops.identity, 'Identity', True), + ] + for params in parameter_list: + self._TestBatchNormForcedUpdates(params[0], params[1], params[2], False) + self._TestBatchNormForcedUpdates(params[0], params[1], params[2], True) def _TestBatchNormForcedUpdates(self, activation, activation_op_name, fused_batch_norm, use_resource): diff --git a/tensorflow/contrib/rate/BUILD b/tensorflow/contrib/rate/BUILD index 4a60b4703ec..e67e62b127b 100644 --- a/tensorflow/contrib/rate/BUILD +++ b/tensorflow/contrib/rate/BUILD @@ -1,9 +1,10 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//visibility:public"]) +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/receptive_field/BUILD b/tensorflow/contrib/receptive_field/BUILD index 18ef0205941..0eeec09b440 100644 --- a/tensorflow/contrib/receptive_field/BUILD +++ b/tensorflow/contrib/receptive_field/BUILD @@ -3,10 +3,9 @@ package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "py_test") diff --git a/tensorflow/contrib/recurrent/BUILD b/tensorflow/contrib/recurrent/BUILD index f9827f766da..2db92600fb7 100644 --- a/tensorflow/contrib/recurrent/BUILD +++ b/tensorflow/contrib/recurrent/BUILD @@ -1,8 +1,9 @@ # Recurrent library. 
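Editor's illustrative aside on the quantize test changes above: they all wrap graph construction in compat.forward_compatibility_horizon(2019, 6, 7), which is why the expected op names switch from FusedBatchNorm to FusedBatchNormV3. Below is a minimal sketch of that pattern, assuming (as the updated tests do) that emission of the V3 op is gated on compat.forward_compatible(2019, 6, 6); the helper function and its toy constants are hypothetical and not part of this change.

    from tensorflow.python.compat import compat
    from tensorflow.python.framework import constant_op
    from tensorflow.python.framework import dtypes
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import nn_impl


    def fused_batch_norm_op_types():
      """Hypothetical helper: op types of a tiny graph with one fused batch norm."""
      g = ops.Graph()
      with g.as_default():
        x = array_ops.placeholder(dtypes.float32, shape=[8, 5, 5, 3])
        scale = constant_op.constant([1.0, 1.0, 1.0])
        offset = constant_op.constant([0.0, 0.0, 0.0])
        nn_impl.fused_batch_norm(x, scale, offset, is_training=True)
      return {op.type for op in g.get_operations()}


    # Inside the horizon, compat.forward_compatible(2019, 6, 6) returns True,
    # so the graph builder is allowed to emit the newer kernel; the folded and
    # quantized graphs the tests inspect are then expected to contain
    # FusedBatchNormV3 rather than FusedBatchNorm.
    with compat.forward_compatibility_horizon(2019, 6, 7):
      assert 'FusedBatchNormV3' in fused_batch_norm_op_types()

Outside the context manager the same helper would be expected to report the older FusedBatchNorm op, which is the behavior the unmodified assertions and OpTypePatterns relied on.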
-package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/reduce_slice_ops/BUILD b/tensorflow/contrib/reduce_slice_ops/BUILD index 02b3d66e461..d9741286e41 100644 --- a/tensorflow/contrib/reduce_slice_ops/BUILD +++ b/tensorflow/contrib/reduce_slice_ops/BUILD @@ -1,4 +1,6 @@ -licenses(["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc index ea4026008ed..be09076e862 100644 --- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc +++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc @@ -37,7 +37,7 @@ namespace functor { #define GPUReduceSliceFunctorReduceop(reduceop, beginning) \ template \ __global__ void ReduceSliceDeviceKernel##reduceop( \ - Cuda3DLaunchConfig config, Index indices_width, Index bound, \ + Gpu3DLaunchConfig config, Index indices_width, Index bound, \ const T begin, const Index *indices, const T *input, T *out) { \ CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) { \ CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) { \ @@ -73,7 +73,7 @@ namespace functor { if (sizex * sizey * sizez == 0) { \ return; \ } \ - Cuda3DLaunchConfig config = GetCuda3DLaunchConfig( \ + Gpu3DLaunchConfig config = GetGpu3DLaunchConfig( \ sizex, sizey, sizez, d, ReduceSliceDeviceKernel##reduceop, \ 0, 0); \ \ diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD index 274bdbeacf7..00552a077ff 100644 --- a/tensorflow/contrib/remote_fused_graph/pylib/BUILD +++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD @@ -1,9 +1,10 @@ # Description: # Contains ops for remote fused graph -package(default_visibility = ["//tensorflow:__subpackages__"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/resampler/BUILD b/tensorflow/contrib/resampler/BUILD index bbf10996759..4e5857b0a55 100644 --- a/tensorflow/contrib/resampler/BUILD +++ b/tensorflow/contrib/resampler/BUILD @@ -1,9 +1,10 @@ -licenses(["notice"]) # Apache 2.0 License +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 License +) exports_files(["LICENSE"]) -package(default_visibility = ["//visibility:public"]) - load( "//tensorflow:tensorflow.bzl", "tf_custom_op_library", diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc index ecb6c187a07..bdadc36bbc7 100644 --- a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc +++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc @@ -117,8 +117,8 @@ struct Resampler2DFunctor { const int data_channels, const int num_sampling_points) { const int output_data_size = batch_size * num_sampling_points * data_channels; - ::tensorflow::CudaLaunchConfig config = - ::tensorflow::GetCudaLaunchConfig(output_data_size, d); + ::tensorflow::GpuLaunchConfig config = + ::tensorflow::GetGpuLaunchConfig(output_data_size, d); TF_CHECK_OK(CudaLaunchKernel( Resampler2DKernel, config.block_count, config.thread_per_block, 
0, d.stream(), data, warp, output, batch_size, data_height, data_width, @@ -252,20 +252,20 @@ struct ResamplerGrad2DFunctor { const int grad_data_size = batch_size * data_height * data_width * data_channels; - ::tensorflow::CudaLaunchConfig config = - ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d); + ::tensorflow::GpuLaunchConfig config = + ::tensorflow::GetGpuLaunchConfig(grad_warp_size, d); TF_CHECK_OK(::tensorflow::CudaLaunchKernel( SetZero, config.block_count, config.thread_per_block, 0, d.stream(), grad_warp_size, grad_warp)); - config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d); + config = ::tensorflow::GetGpuLaunchConfig(grad_data_size, d); TF_CHECK_OK(::tensorflow::CudaLaunchKernel( SetZero, config.block_count, config.thread_per_block, 0, d.stream(), grad_data_size, grad_data)); const int resampler_output_size = batch_size * num_sampling_points * data_channels; - config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d); + config = ::tensorflow::GetGpuLaunchConfig(resampler_output_size, d); TF_CHECK_OK(CudaLaunchKernel(ResamplerGrad2DKernel, config.block_count, config.thread_per_block, 0, d.stream(), data, warp, grad_output, grad_data, grad_warp, diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD index 66fadcc16b5..4d3fc81199d 100644 --- a/tensorflow/contrib/rnn/BUILD +++ b/tensorflow/contrib/rnn/BUILD @@ -2,9 +2,10 @@ # Contains ops to train linear models on top of TensorFlow. # APIs here are meant to evolve over time. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//visibility:public"]) +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) @@ -12,12 +13,12 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_tests") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") load( "//tensorflow:tensorflow.bzl", - "tf_custom_op_library", "tf_cc_test", - "tf_py_test", + "tf_custom_op_library", "tf_gen_op_libs", - "tf_kernel_library", "tf_gen_op_wrapper_py", + "tf_kernel_library", + "tf_py_test", ) cc_library( diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD index dbd311a276b..f8463c050c1 100644 --- a/tensorflow/contrib/rpc/BUILD +++ b/tensorflow/contrib/rpc/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD index cb0b89ae55b..76f2ddc2d84 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/BUILD +++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/rpc/python/ops/BUILD b/tensorflow/contrib/rpc/python/ops/BUILD index 84d2a1832f1..4dee58ccaa4 100644 --- a/tensorflow/contrib/rpc/python/ops/BUILD +++ b/tensorflow/contrib/rpc/python/ops/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD 
index 969ff19eca6..173fb8f5ac9 100644 --- a/tensorflow/contrib/saved_model/BUILD +++ b/tensorflow/contrib/saved_model/BUILD @@ -16,12 +16,13 @@ # Description: # SavedModel contrib libraries. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/saved_model/cc/saved_model/BUILD b/tensorflow/contrib/saved_model/cc/saved_model/BUILD index ea4d41d43b5..9d9a39e61e1 100644 --- a/tensorflow/contrib/saved_model/cc/saved_model/BUILD +++ b/tensorflow/contrib/saved_model/cc/saved_model/BUILD @@ -16,9 +16,10 @@ # Description: # SavedModel contrib libraries for C++. -package(default_visibility = ["//tensorflow:__subpackages__"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD index f42a2953ef9..4d3fdd689d7 100644 --- a/tensorflow/contrib/seq2seq/BUILD +++ b/tensorflow/contrib/seq2seq/BUILD @@ -1,12 +1,13 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = [ - "//learning/brain/google/xla/tests:__subpackages__", - "//tensorflow:__subpackages__", -]) +package( + default_visibility = [ + "//learning/brain/google/xla/tests:__subpackages__", + "//tensorflow:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) @@ -16,8 +17,8 @@ load( "//tensorflow:tensorflow.bzl", "tf_custom_op_library", "tf_gen_op_libs", - "tf_kernel_library", "tf_gen_op_wrapper_py", + "tf_kernel_library", ) tf_custom_op_py_library( diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc b/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc index 3af6b1cb766..4af15095eec 100644 --- a/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc +++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc @@ -90,7 +90,7 @@ struct GatherTree { // First kernel launch to "zero" things out beams.device(d) = beams.constant(end_token); - CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * beam_width, d); + GpuLaunchConfig config = GetCudaLaunchConfig(batch_size * beam_width, d); TF_CHECK_OK(CudaLaunchKernel( GatherTreeOpKernel, config.block_count, config.thread_per_block, 0, d.stream(), batch_size, max_time, beam_width, step_ids.data(), diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py index 33f7bac8159..ab124959001 100644 --- a/tensorflow/contrib/seq2seq/python/ops/decoder.py +++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py @@ -113,6 +113,20 @@ class Decoder(object): raise NotImplementedError def finalize(self, outputs, final_state, sequence_lengths): + """Called after decoding iterations complete. + + Args: + outputs: RNNCell outputs (possibly nested tuple of) tensor[s] for all time + steps. + final_state: RNNCell final state (possibly nested tuple of) tensor[s] for + last time step. + sequence_lengths: 1-D `int32` tensor containing lengths of each sequence. 
+ + Returns: + `(final_outputs, final_state)`: `final_outputs` is an object containing + the final decoder output, `final_state` is a (structure of) state tensors + and TensorArrays. + """ raise NotImplementedError @property diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD index 40774c2238a..97c7bb7918b 100644 --- a/tensorflow/contrib/session_bundle/BUILD +++ b/tensorflow/contrib/session_bundle/BUILD @@ -1,9 +1,10 @@ # Description: # TensorFlow Serving session bundle. -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/session_bundle/example/BUILD b/tensorflow/contrib/session_bundle/example/BUILD index 18a075943c2..37c8656616f 100644 --- a/tensorflow/contrib/session_bundle/example/BUILD +++ b/tensorflow/contrib/session_bundle/example/BUILD @@ -2,10 +2,9 @@ package( default_visibility = ["//tensorflow/contrib/session_bundle:__subpackages__"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) # vardef("PYTHON_BIN_PATH", "/usr/bin/python") diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.cc b/tensorflow/contrib/session_bundle/session_bundle_test.cc index 612623ae309..9e4b1c72195 100644 --- a/tensorflow/contrib/session_bundle/session_bundle_test.cc +++ b/tensorflow/contrib/session_bundle/session_bundle_test.cc @@ -240,8 +240,8 @@ TEST(LoadSessionBundleFromPath, BasicTestRunOptionsThreadPoolInvalid) { // Expect failed session run calls with invalid run-options. EXPECT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), - "Invalid inter_op_thread_pool: 2")) + EXPECT_TRUE(absl::StrContains(status.error_message(), + "Invalid inter_op_thread_pool: 2")) << status.error_message(); } @@ -315,8 +315,8 @@ TEST_F(SessionBundleTest, ServingGraphEmpty) { }); status_ = LoadSessionBundleFromPath(options_, path, &bundle_); EXPECT_FALSE(status_.ok()); - EXPECT_TRUE(str_util::StrContains(status_.error_message(), - "Expected exactly one serving GraphDef")) + EXPECT_TRUE(absl::StrContains(status_.error_message(), + "Expected exactly one serving GraphDef")) << status_.error_message(); } @@ -332,8 +332,8 @@ TEST_F(SessionBundleTest, ServingGraphAnyIncorrectType) { status_ = LoadSessionBundleFromPath(options_, path, &bundle_); EXPECT_FALSE(status_.ok()); EXPECT_TRUE( - str_util::StrContains(status_.error_message(), - "Expected Any type_url for: tensorflow.GraphDef")) + absl::StrContains(status_.error_message(), + "Expected Any type_url for: tensorflow.GraphDef")) << status_.error_message(); } @@ -349,8 +349,7 @@ TEST_F(SessionBundleTest, ServingGraphAnyValueCorrupted) { }); status_ = LoadSessionBundleFromPath(options_, path, &bundle_); EXPECT_FALSE(status_.ok()); - EXPECT_TRUE( - str_util::StrContains(status_.error_message(), "Failed to unpack")) + EXPECT_TRUE(absl::StrContains(status_.error_message(), "Failed to unpack")) << status_.error_message(); } @@ -365,7 +364,7 @@ TEST_F(SessionBundleTest, AssetFileAnyIncorrectType) { }); status_ = LoadSessionBundleFromPath(options_, path, &bundle_); EXPECT_FALSE(status_.ok()); - EXPECT_TRUE(str_util::StrContains( + EXPECT_TRUE(absl::StrContains( status_.error_message(), "Expected Any type_url for: tensorflow.serving.AssetFile")) << status_.error_message(); @@ -383,8 +382,7 @@ TEST_F(SessionBundleTest, AssetFileAnyValueCorrupted) { }); 
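Editor's illustrative aside on the Decoder.finalize docstring added above: it documents a post-processing hook that is invoked once the decode loop has finished. A minimal sketch, roughly mirroring how that contract is consumed; the helper name and its arguments are hypothetical, and decoders that do not override finalize simply keep the stacked per-step outputs.

    from tensorflow.python.util import nest


    def finalize_decode(decoder, final_outputs_ta, final_state,
                        final_sequence_lengths):
      """Hypothetical helper: stack per-step outputs, then let the decoder finalize."""
      # Stack each per-step TensorArray into a single time-major tensor.
      final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)
      try:
        # A decoder may rewrite the results here; for example, a beam-search
        # decoder gathers the surviving beams at this point.
        final_outputs, final_state = decoder.finalize(
            final_outputs, final_state, final_sequence_lengths)
      except NotImplementedError:
        pass
      return final_outputs, final_state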
status_ = LoadSessionBundleFromPath(options_, path, &bundle_); EXPECT_FALSE(status_.ok()); - EXPECT_TRUE( - str_util::StrContains(status_.error_message(), "Failed to unpack")) + EXPECT_TRUE(absl::StrContains(status_.error_message(), "Failed to unpack")) << status_.error_message(); } @@ -399,8 +397,8 @@ TEST_F(SessionBundleTest, InitOpTooManyValues) { }); status_ = LoadSessionBundleFromPath(options_, path, &bundle_); EXPECT_FALSE(status_.ok()); - EXPECT_TRUE(str_util::StrContains(status_.error_message(), - "Expected exactly one serving init op")) + EXPECT_TRUE(absl::StrContains(status_.error_message(), + "Expected exactly one serving init op")) << status_.error_message(); } diff --git a/tensorflow/contrib/session_bundle/signature_test.cc b/tensorflow/contrib/session_bundle/signature_test.cc index b1ff55552e0..99b55e3c3be 100644 --- a/tensorflow/contrib/session_bundle/signature_test.cc +++ b/tensorflow/contrib/session_bundle/signature_test.cc @@ -35,7 +35,7 @@ namespace serving { namespace { static bool HasSubstr(StringPiece base, StringPiece substr) { - bool ok = str_util::StrContains(base, substr); + bool ok = absl::StrContains(base, substr); EXPECT_TRUE(ok) << base << ", expected substring " << substr; return ok; } @@ -70,8 +70,8 @@ TEST(GetClassificationSignature, MissingSignature) { ClassificationSignature signature; const Status status = GetClassificationSignature(meta_graph_def, &signature); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), - "Expected a classification signature")) + EXPECT_TRUE(absl::StrContains(status.error_message(), + "Expected a classification signature")) << status.error_message(); } @@ -87,8 +87,8 @@ TEST(GetClassificationSignature, WrongSignatureType) { ClassificationSignature signature; const Status status = GetClassificationSignature(meta_graph_def, &signature); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), - "Expected a classification signature")) + EXPECT_TRUE(absl::StrContains(status.error_message(), + "Expected a classification signature")) << status.error_message(); } @@ -123,8 +123,8 @@ TEST(GetNamedClassificationSignature, MissingSignature) { const Status status = GetNamedClassificationSignature("foo", meta_graph_def, &signature); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), - "Missing signature named \"foo\"")) + EXPECT_TRUE(absl::StrContains(status.error_message(), + "Missing signature named \"foo\"")) << status.error_message(); } @@ -142,9 +142,9 @@ TEST(GetNamedClassificationSignature, WrongSignatureType) { const Status status = GetNamedClassificationSignature("foo", meta_graph_def, &signature); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains( - status.error_message(), - "Expected a classification signature for name \"foo\"")) + EXPECT_TRUE( + absl::StrContains(status.error_message(), + "Expected a classification signature for name \"foo\"")) << status.error_message(); } @@ -177,8 +177,8 @@ TEST(GetRegressionSignature, MissingSignature) { RegressionSignature signature; const Status status = GetRegressionSignature(meta_graph_def, &signature); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), - "Expected a regression signature")) + EXPECT_TRUE(absl::StrContains(status.error_message(), + "Expected a regression signature")) << status.error_message(); } @@ -194,8 +194,8 @@ TEST(GetRegressionSignature, WrongSignatureType) { RegressionSignature signature; const Status status = 
GetRegressionSignature(meta_graph_def, &signature); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), - "Expected a regression signature")) + EXPECT_TRUE(absl::StrContains(status.error_message(), + "Expected a regression signature")) << status.error_message(); } @@ -228,8 +228,8 @@ TEST(GetNamedSignature, MissingSignature) { Signature signature; const Status status = GetNamedSignature("foo", meta_graph_def, &signature); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), - "Missing signature named \"foo\"")) + EXPECT_TRUE(absl::StrContains(status.error_message(), + "Missing signature named \"foo\"")) << status.error_message(); } @@ -371,7 +371,7 @@ TEST(RunClassification, RunNotOk) { const Status status = RunClassification(signature, input_tensor, &session, &classes_tensor, nullptr); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), "Data is gone")) + EXPECT_TRUE(absl::StrContains(status.error_message(), "Data is gone")) << status.error_message(); } @@ -387,8 +387,7 @@ TEST(RunClassification, TooManyOutputs) { const Status status = RunClassification(signature, input_tensor, &session, &classes_tensor, nullptr); ASSERT_FALSE(status.ok()); - EXPECT_TRUE( - str_util::StrContains(status.error_message(), "Expected 1 output")) + EXPECT_TRUE(absl::StrContains(status.error_message(), "Expected 1 output")) << status.error_message(); } @@ -405,8 +404,8 @@ TEST(RunClassification, WrongBatchOutputs) { &classes_tensor, nullptr); ASSERT_FALSE(status.ok()); EXPECT_TRUE( - str_util::StrContains(status.error_message(), - "Input batch size did not match output batch size")) + absl::StrContains(status.error_message(), + "Input batch size did not match output batch size")) << status.error_message(); } @@ -452,7 +451,7 @@ TEST_F(RunRegressionTest, RunNotOk) { const Status status = RunRegression(signature_, input_tensor_, &session_, &output_tensor_); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), "Data is gone")) + EXPECT_TRUE(absl::StrContains(status.error_message(), "Data is gone")) << status.error_message(); } @@ -464,8 +463,8 @@ TEST_F(RunRegressionTest, MismatchedSizeForBatchInputAndOutput) { RunRegression(signature_, input_tensor_, &session_, &output_tensor_); ASSERT_FALSE(status.ok()); EXPECT_TRUE( - str_util::StrContains(status.error_message(), - "Input batch size did not match output batch size")) + absl::StrContains(status.error_message(), + "Input batch size did not match output batch size")) << status.error_message(); } @@ -491,8 +490,7 @@ TEST(GetSignatures, MissingSignature) { Signatures read_signatures; const auto status = GetSignatures(meta_graph_def, &read_signatures); EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code()); - EXPECT_TRUE( - str_util::StrContains(status.error_message(), "Expected exactly one")) + EXPECT_TRUE(absl::StrContains(status.error_message(), "Expected exactly one")) << status.error_message(); } @@ -506,9 +504,9 @@ TEST(GetSignatures, WrongProtoInAny) { Signatures read_signatures; const auto status = GetSignatures(meta_graph_def, &read_signatures); EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), - "Expected Any type_url for: " - "tensorflow.serving.Signatures")) + EXPECT_TRUE(absl::StrContains(status.error_message(), + "Expected Any type_url for: " + "tensorflow.serving.Signatures")) << status.error_message(); } @@ -523,7 +521,7 @@ 
TEST(GetSignatures, JunkInAny) { Signatures read_signatures; const auto status = GetSignatures(meta_graph_def, &read_signatures); EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), "Failed to unpack")) + EXPECT_TRUE(absl::StrContains(status.error_message(), "Failed to unpack")) << status.error_message(); } @@ -570,8 +568,7 @@ TEST(GetSignatures, MultipleSignaturesNotOK) { Signatures read_signatures; const auto status = GetSignatures(meta_graph_def, &read_signatures); EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code()); - EXPECT_TRUE( - str_util::StrContains(status.error_message(), "Expected exactly one")) + EXPECT_TRUE(absl::StrContains(status.error_message(), "Expected exactly one")) << status.error_message(); } @@ -645,8 +642,8 @@ TEST(GetGenericSignature, WrongSignatureType) { const Status status = GetGenericSignature("generic_bindings", meta_graph_def, &signature); ASSERT_FALSE(status.ok()); - EXPECT_TRUE(str_util::StrContains(status.error_message(), - "Expected a generic signature:")) + EXPECT_TRUE(absl::StrContains(status.error_message(), + "Expected a generic signature:")) << status.error_message(); } diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD index 5e4f130b314..61798014da2 100644 --- a/tensorflow/contrib/signal/BUILD +++ b/tensorflow/contrib/signal/BUILD @@ -1,6 +1,7 @@ -package(default_visibility = ["//tensorflow:internal"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/slim/BUILD b/tensorflow/contrib/slim/BUILD index 96e2dcecbdf..43c665d6687 100644 --- a/tensorflow/contrib/slim/BUILD +++ b/tensorflow/contrib/slim/BUILD @@ -1,12 +1,13 @@ # Description: # Contains the Slim library, including common neural networks and examples. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/slim/python/slim/data/BUILD b/tensorflow/contrib/slim/python/slim/data/BUILD index f1b57361ac6..d6fe04ec410 100644 --- a/tensorflow/contrib/slim/python/slim/data/BUILD +++ b/tensorflow/contrib/slim/python/slim/data/BUILD @@ -1,12 +1,13 @@ # Description: # Contains packages used for creating and loading datasets. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "py_test") py_library( diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD index f19177b1881..36b1f048e79 100644 --- a/tensorflow/contrib/slim/python/slim/nets/BUILD +++ b/tensorflow/contrib/slim/python/slim/nets/BUILD @@ -1,19 +1,18 @@ # Description: # Contains typical networks definitions. 
-licenses(["notice"]) # Apache 2.0 - -exports_files(["LICENSE"]) - -load("//tensorflow:tensorflow.bzl", "py_test") - package( default_visibility = [ "//tensorflow:__subpackages__", "//tensorflow_models:__subpackages__", ], + licenses = ["notice"], # Apache 2.0 ) +exports_files(["LICENSE"]) + +load("//tensorflow:tensorflow.bzl", "py_test") + # Transitive dependencies of this target will be included in the pip package. py_library( name = "nets_pip", diff --git a/tensorflow/contrib/solvers/BUILD b/tensorflow/contrib/solvers/BUILD index 5247288d54a..0c30ab24439 100644 --- a/tensorflow/contrib/solvers/BUILD +++ b/tensorflow/contrib/solvers/BUILD @@ -2,12 +2,13 @@ # Contains ops for iterative solvers for linear systems, linear least-squares # problems, singular value decomposition and eigenvalue decomposition. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "cuda_py_test") py_library( diff --git a/tensorflow/contrib/sparsemax/BUILD b/tensorflow/contrib/sparsemax/BUILD index ed4eca1a60a..cac8818febc 100644 --- a/tensorflow/contrib/sparsemax/BUILD +++ b/tensorflow/contrib/sparsemax/BUILD @@ -2,12 +2,13 @@ # Contains ops to train linear models on top of TensorFlow. # APIs here are meant to evolve over time. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//visibility:public"]) - load("//tensorflow:tensorflow.bzl", "cuda_py_tests") load( "//tensorflow:tensorflow.bzl", diff --git a/tensorflow/contrib/specs/BUILD b/tensorflow/contrib/specs/BUILD index 055b04db8a5..8cd92293d9f 100644 --- a/tensorflow/contrib/specs/BUILD +++ b/tensorflow/contrib/specs/BUILD @@ -1,12 +1,13 @@ # Description: # A small domain-specific language (DSL) for defining deep learning networks. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "tf_py_test") py_library( diff --git a/tensorflow/contrib/staging/BUILD b/tensorflow/contrib/staging/BUILD index 0c86f3db1d5..96f7066646c 100644 --- a/tensorflow/contrib/staging/BUILD +++ b/tensorflow/contrib/staging/BUILD @@ -1,8 +1,9 @@ -package(default_visibility = [ - "//visibility:public", -]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/stat_summarizer/BUILD b/tensorflow/contrib/stat_summarizer/BUILD index 412a2c81a14..d32ccb0270b 100644 --- a/tensorflow/contrib/stat_summarizer/BUILD +++ b/tensorflow/contrib/stat_summarizer/BUILD @@ -1,12 +1,13 @@ # Description: # Contains a Python wrapper for the StatSummarizer C++ class. 
-licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "tf_py_test") py_library( diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD index f16d99f64c1..bbc5f7d470e 100644 --- a/tensorflow/contrib/stateless/BUILD +++ b/tensorflow/contrib/stateless/BUILD @@ -1,8 +1,9 @@ # Stateless random ops -package(default_visibility = ["//tensorflow:__subpackages__"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD index 4085801342b..bea00d70918 100644 --- a/tensorflow/contrib/summary/BUILD +++ b/tensorflow/contrib/summary/BUILD @@ -1,4 +1,6 @@ -licenses(["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) exports_files([ "LICENSE", diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD index a7f8819915b..e27204dc0a9 100644 --- a/tensorflow/contrib/tensor_forest/BUILD +++ b/tensorflow/contrib/tensor_forest/BUILD @@ -1,7 +1,5 @@ # TensorFlow code for training random forests. -licenses(["notice"]) # Apache 2.0 - load("//tensorflow:tensorflow.bzl", "py_test") load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object") @@ -12,7 +10,10 @@ load("//tensorflow:tensorflow.bzl", "tf_kernel_library") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static") -package(default_visibility = ["//visibility:public"]) +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/tensor_forest/hybrid/BUILD b/tensorflow/contrib/tensor_forest/hybrid/BUILD index 64176a0dd07..c5949881108 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/BUILD +++ b/tensorflow/contrib/tensor_forest/hybrid/BUILD @@ -1,13 +1,14 @@ # TensorFlow code for training hybrid neural network / decision tree models. 
-licenses(["notice"]) # Apache 2.0 - load("//tensorflow:tensorflow.bzl", "py_test") load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") -package(default_visibility = ["//visibility:public"]) +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD index b1b1559383a..d205b255402 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD +++ b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD @@ -5,10 +5,9 @@ load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static") package( default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) DECISION_TREE_RESOURCE_DEPS = [ diff --git a/tensorflow/contrib/tensor_forest/proto/BUILD b/tensorflow/contrib/tensor_forest/proto/BUILD index 04fd6a98395..ae5fef78b5e 100644 --- a/tensorflow/contrib/tensor_forest/proto/BUILD +++ b/tensorflow/contrib/tensor_forest/proto/BUILD @@ -1,11 +1,12 @@ -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library") -package(default_visibility = ["//visibility:public"]) - tf_proto_library( name = "fertile_stats_proto", srcs = ["fertile_stats.proto"], diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD index 85070cfad01..c2506d0346b 100644 --- a/tensorflow/contrib/tensorboard/BUILD +++ b/tensorflow/contrib/tensorboard/BUILD @@ -1,9 +1,10 @@ # Description: # TensorBoard module containing volatile or experimental code. -package(default_visibility = ["//tensorflow:internal"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index 91b6d2614a8..d90d6af9ba3 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -3,9 +3,10 @@ # and provide TensorRT operators and converter package. # APIs are meant to change over time. -package(default_visibility = ["//visibility:public"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD index 0a2cf105baf..43788306880 100644 --- a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD +++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD @@ -4,9 +4,10 @@ # APIs are meant to change while upgrading TRT. # add init_py into pip package BUILD dependency to install it. 
-package(default_visibility = ["//tensorflow:__subpackages__"]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) load( "//tensorflow:tensorflow.bzl", diff --git a/tensorflow/contrib/testing/BUILD b/tensorflow/contrib/testing/BUILD index 8a40e111d77..258026a6bdb 100644 --- a/tensorflow/contrib/testing/BUILD +++ b/tensorflow/contrib/testing/BUILD @@ -1,12 +1,13 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - py_library( name = "testing_py", srcs = [ diff --git a/tensorflow/contrib/text/BUILD b/tensorflow/contrib/text/BUILD index 9f9e19a7cd6..5f4e4dff3d9 100644 --- a/tensorflow/contrib/text/BUILD +++ b/tensorflow/contrib/text/BUILD @@ -2,12 +2,13 @@ # contains parts of TensorFlow that are experimental or unstable and which # are not supported. -package(default_visibility = [ - "//learning/brain:__subpackages__", - "//tensorflow:__subpackages__", -]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//learning/brain:__subpackages__", + "//tensorflow:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) load( "//tensorflow:tensorflow.bzl", diff --git a/tensorflow/contrib/tfprof/BUILD b/tensorflow/contrib/tfprof/BUILD index e7f4ebdd36a..c8846391ccd 100644 --- a/tensorflow/contrib/tfprof/BUILD +++ b/tensorflow/contrib/tfprof/BUILD @@ -1,9 +1,10 @@ -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - py_library( name = "tfprof", srcs = [ diff --git a/tensorflow/contrib/timeseries/BUILD b/tensorflow/contrib/timeseries/BUILD index 18933227b34..989085564a5 100644 --- a/tensorflow/contrib/timeseries/BUILD +++ b/tensorflow/contrib/timeseries/BUILD @@ -1,8 +1,9 @@ -package(default_visibility = [ - "//tensorflow:internal", -]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//tensorflow:internal", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD index 235f3adb92f..03979932a96 100644 --- a/tensorflow/contrib/timeseries/examples/BUILD +++ b/tensorflow/contrib/timeseries/examples/BUILD @@ -3,10 +3,9 @@ load("//tensorflow:tensorflow.bzl", "py_binary") package( default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) config_setting( diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD index ae2c4a5cb72..02e475367af 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/BUILD +++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD @@ -4,10 +4,9 @@ package( default_visibility = [ "//tensorflow:internal", ], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) py_library( diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD index 
08eafece5d3..38c3ac4dc4d 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD +++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD @@ -5,10 +5,9 @@ load("//tensorflow:tensorflow.bzl", "py_test") package( default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 ) -licenses(["notice"]) # Apache 2.0 - exports_files(["LICENSE"]) py_library( diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index a53cf2b86c0..be02f29b432 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -9,8 +9,6 @@ load( ) load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") -licenses(["notice"]) # Apache 2.0 - package( default_visibility = [ "//cloud/vmm/testing/tests/tpu:__subpackages__", @@ -23,6 +21,7 @@ package( "//tensorflow_models:__subpackages__", "//vr/perception:__subpackages__", ], + licenses = ["notice"], # Apache 2.0 ) py_library( diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD index e2ce77e1181..461f9856b0d 100644 --- a/tensorflow/contrib/tpu/profiler/BUILD +++ b/tensorflow/contrib/tpu/profiler/BUILD @@ -1,4 +1,6 @@ -licenses(["notice"]) # Apache 2.0 +package( + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "tf_cc_binary") load("//tensorflow:tensorflow.bzl", "tf_cc_test") diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD index 8f1d5ce2fdf..22635592aed 100644 --- a/tensorflow/contrib/training/BUILD +++ b/tensorflow/contrib/training/BUILD @@ -1,14 +1,15 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//tensorflow:internal", + ], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = [ - "//tensorflow:internal", -]) - load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library") load("//tensorflow:tensorflow.bzl", "py_test") diff --git a/tensorflow/contrib/util/BUILD b/tensorflow/contrib/util/BUILD index 7b2bc30e3a8..3d4123062ac 100644 --- a/tensorflow/contrib/util/BUILD +++ b/tensorflow/contrib/util/BUILD @@ -1,12 +1,13 @@ # Description: # contains parts of TensorFlow that are experimental or unstable and which are not supported. -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = ["//tensorflow:__subpackages__"], + licenses = ["notice"], # Apache 2.0 +) exports_files(["LICENSE"]) -package(default_visibility = ["//tensorflow:__subpackages__"]) - load("//tensorflow:tensorflow.bzl", "tf_cc_binary") load("//tensorflow:tensorflow.bzl", "tf_cc_test") diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD index 19cb8983b68..3cfd4d6e81d 100644 --- a/tensorflow/contrib/verbs/BUILD +++ b/tensorflow/contrib/verbs/BUILD @@ -1,11 +1,12 @@ # Description: # Verbs RDMA communication interfaces and implementations for TensorFlow. 
-package(default_visibility = [ - "//tensorflow:__subpackages__", -]) - -licenses(["notice"]) # Apache 2.0 +package( + default_visibility = [ + "//tensorflow:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) load("//tensorflow:tensorflow.bzl", "tf_cuda_library") diff --git a/tensorflow/contrib/verbs/verbs_util.cc b/tensorflow/contrib/verbs/verbs_util.cc index a6333d9f362..dc5815181f1 100644 --- a/tensorflow/contrib/verbs/verbs_util.cc +++ b/tensorflow/contrib/verbs/verbs_util.cc @@ -44,7 +44,7 @@ void VerbsUtil::GetKeyAndStepId(const string& key_with_step_id, string& key, CHECK(parts.size() == 6) << "Key with step_id must have 6 parts"; strings::safe_strto64(parts[5], &step_id); parts.pop_back(); // remove step_id - key.assign(str_util::Join(parts, ";")); // stitch them together + key.assign(absl::StrJoin(parts, ";")); // stitch them together } } // namespace tensorflow diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index bcd02aa8410..b07e018dd2a 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -349,6 +349,8 @@ cc_library( deps = [ ":lib_platform", "//tensorflow/core/platform/default/build_config:base", + "@com_google_absl//absl/base", + "@com_google_absl//absl/strings", ], ) @@ -1000,7 +1002,6 @@ cc_library( name = "allocator", srcs = [ "framework/allocator.cc", - "framework/allocator_registry.cc", "framework/allocator_registry.h", "framework/numeric_types.h", "framework/tracking_allocator.cc", @@ -1012,12 +1013,37 @@ cc_library( ], features = ["parse_headers"], visibility = ["//visibility:public"], + deps = [ + ":lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "//third_party/eigen3", + ] + if_static(extra_deps = [":allocator_registry_impl"]), + alwayslink = 1, +) + +# This target will be included in libtensorflow_framework.so via the +# framework_internal_impl target. +# All other dependencies on this target need to go through if_static guard, +# as otherwise duplicate registration in the registry will cause crashes. 
+cc_library( + name = "allocator_registry_impl", + srcs = [ + "framework/allocator.h", + "framework/allocator_registry.cc", + "framework/allocator_registry.h", + "framework/cpu_allocator_impl.cc", + "framework/numeric_types.h", + "framework/tracking_allocator.h", + "framework/type_traits.h", + ], deps = [ ":lib", "//third_party/eigen3", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", ], + alwayslink = 1, ) cc_library( @@ -1084,6 +1110,7 @@ cc_library( ":lib_internal", ":protos_all_cc", "//tensorflow/core/util/proto:proto_utils", + "@com_google_absl//absl/strings", ], ) @@ -2324,6 +2351,7 @@ tf_proto_library_cc( srcs = ["protobuf/eager_service.proto"], has_services = 1, cc_api_version = 2, + cc_grpc_version = 1, cc_stubby_versions = ["2"], protodeps = tf_additional_all_protos(), visibility = [ @@ -2872,6 +2900,7 @@ tf_cuda_library( "**/*test*", "**/*main.cc", "framework/allocator.cc", + "framework/cpu_allocator_impl.cc", "framework/allocator_registry.cc", "framework/tracking_allocator.cc", "example/example_parser_configuration.*", @@ -2905,6 +2934,7 @@ tf_cuda_library( ], }), deps = [ + ":allocator_registry_impl", ":allocator", ":feature_util", ":lib", @@ -3342,6 +3372,7 @@ cc_library( "//tensorflow/compiler:__subpackages__", "//tensorflow/core/kernels:__subpackages__", "//tensorflow/core/profiler:__subpackages__", + "//tensorflow/stream_executor:__subpackages__", ], deps = [":lib_internal"], ) @@ -3466,7 +3497,6 @@ GPU_RUNTIME_HEADERS = [ tf_cuda_library( name = "gpu_runtime_impl", srcs = [ - "common_runtime/gpu/gpu_bfc_allocator.cc", "common_runtime/gpu/gpu_cudamalloc_allocator.cc", "common_runtime/gpu/gpu_debug_allocator.cc", "common_runtime/gpu/gpu_device.cc", @@ -3484,6 +3514,7 @@ tf_cuda_library( ":core_cpu_lib", ":framework", ":framework_internal", + ":gpu_bfc_allocator", ":gpu_id_impl", ":gpu_init_impl", ":gpu_lib", @@ -3773,6 +3804,8 @@ tf_cc_tests( ":test", ":test_main", "//third_party/eigen3", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", "@zlib_archive//:zlib", ], ) @@ -3788,6 +3821,7 @@ tf_cc_test( ":protos_all_cc", ":test", "//third_party/eigen3", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/api_def/base_api/api_def_FusedBatchNormGradV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedBatchNormGradV3.pbtxt new file mode 100644 index 00000000000..76b33b959f6 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_FusedBatchNormGradV3.pbtxt @@ -0,0 +1,116 @@ +op { + graph_op_name: "FusedBatchNormGradV3" + in_arg { + name: "y_backprop" + description: <